Remove template parameter from Tensor (#9939)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9939 Pull Request resolved: https://github.com/facebookresearch/weakly-supervised-action-detection/pull/13 Pull Request resolved: https://github.com/pytorch/translate/pull/166 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9125 Closes https://github.com/pytorch/pytorch/pull/9125 Use inheritance for polymorphism, and remove template parameter This is to change the templating in call sites, the core implementations will change later Before Caffe2 Tensor class was compile-time fixed to bind to a particular device/context. With this change, we're making it a runtime property (stored inside the tensor), but preserve the same semantics. For example, one has to specify device type in order to create a Tensor - there are no uninitialized tensors. More specifically the changes are: 1. We added an extra argument *DeviceType* to most of the constructors of the tensor, e.g. (Tensor(DeviceType type)), 2. Semantics of constructor Tensor(const Tensor<SrcContext>& src, ContextForCopy* context); is changed, in this constructor, the second context is passed in to enable us to call the templated Copy function, it could be in a different context as source and target previously, now we'll enforce that the context should have same device type as src, if it is provided. 3. To preserve 'get-or-construct' semantics of Blob, we added specialized getter Blob::GetMutableTensor that verifies both that Blob contains a Tensor and that it's of a correct type 4. Specifically, Tensor type is not default-constructible any more (as we don't have unknown device tensors) and thus some of the code handling STL containers needs to change Note: Some changes are postponed just to keep this diff a bit smaller. Please see `TODO`s. Reviewed By: ezyang, houseroad Differential Revision: D9024330 fbshipit-source-id: e0b8295d2dc6ebe2963383ded5af799ad17164ba
2025-10-20 21:14:14 +08:00 · 2018-07-27 10:50:54 -07:00
parent 94439d7df4
commit aebf3b47ae
365 changed files with 4187 additions and 3515 deletions
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@ -160,7 +160,7 @@ void loadInput(
          CAFFE_THROW("Not support GPU on mobile.");
 #endif
        } else {
-          caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+          caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
          CHECK_NOTNULL(tensor);
          tensor->Resize(input_dims);
          if (input_type_list[i] == "uint8_t") {
@ -197,7 +197,7 @@ void fillInputBlob(
    int protos_size = tensor_kv.second.protos_size();
    caffe2::TensorProto* tensor_proto =
        tensor_kv.second.mutable_protos(iteration % protos_size);
-    caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
    tensor->Resize(std::vector<caffe2::TIndex>());
    if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
      (tensor->mutable_data<std::string>())[0] = tensor_proto->string_data(0);
@ -290,7 +290,7 @@ void writeOutput(
 #endif
        } else {
          writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
-              workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>(),
+              workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
              output_prefix,
              name);
        }
--- a/binaries/benchmark_helper.h
+++ b/binaries/benchmark_helper.h
@ -35,7 +35,7 @@ void writeTextOutput(
    const string& output_prefix,
    const string& name) {
  string output_name = output_prefix + "/" + name + ".txt";
-  caffe2::TensorSerializer<ContextType> ser;
+  caffe2::TensorSerializer ser;
  caffe2::BlobProto blob_proto;
  ser.Serialize(
      *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
--- a/binaries/core_overhead_benchmark.cc
+++ b/binaries/core_overhead_benchmark.cc
@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);

 static void BM_CudaPointerAffinity(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
-  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
+  Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
  float* ptr = tensor.mutable_data<float>();
  while (state.KeepRunning()) {
    volatile int id = GetGPUIDForPointer(ptr);
@ -198,7 +198,7 @@ static void BM_RawAllocDeallocCPU(benchmark::State& state) {
 BENCHMARK(BM_RawAllocDeallocCPU);

 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
-  Tensor<CPUContext> tensor;
+  Tensor tensor(CPU);
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
@ -210,7 +210,7 @@ BENCHMARK(BM_TensorAllocDeallocCPU);

 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
-  Tensor<CUDAContext> tensor;
+  Tensor tensor(CUDA);
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
--- a/binaries/print_core_object_sizes.cc
+++ b/binaries/print_core_object_sizes.cc
@ -28,8 +28,7 @@

 int main(int /* unused */, char** /* unused */) {
  PRINT_SIZE(caffe2::Blob);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
+  PRINT_SIZE(caffe2::Tensor);
  PRINT_SIZE(caffe2::CPUContext);
  PRINT_SIZE(caffe2::CUDAContext);
  PRINT_SIZE(caffe2::OperatorBase);
--- a/binaries/speed_benchmark.cc
+++ b/binaries/speed_benchmark.cc
@ -136,7 +136,7 @@ int main(int argc, char** argv) {
        if (blob == nullptr) {
          blob = workspace->CreateBlob(input_names[i]);
        }
-        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+        caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
        CHECK_NOTNULL(tensor);
        tensor->Resize(input_dims);
        if (input_type_list[i] == "uint8_t") {
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@ -54,11 +54,11 @@ private:
    #undef DEFINE_CASE
  }

-  at::Type & typeFor(const Tensor<Context> & ten) {
+  at::Type& typeFor(const Tensor& ten) {
    return at::getType(backend(), atScalarTypeFor(ten.meta()));
  }
-  at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
-    auto& ten = const_cast<Tensor<Context>&>(ten_);
+  at::Tensor tensorWrapping(const Tensor& ten_) {
+    auto& ten = const_cast<Tensor&>(ten_);
    return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
  }

@ -88,7 +88,7 @@ private:
    }
    CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
  }
-  void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
+  void assignTo(Tensor* dst, const at::Tensor& src_) {
    at::Tensor src = src_.contiguous();
    auto at_sizes = src.sizes();
    std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
@ -121,7 +121,7 @@ private:
    return s.toLong();
  }

-  void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
+  void assignTo(Tensor* dst, at::Type& inferred_type, at::Scalar scalar) {
    switch(inferred_type.scalarType()) {
      #define DEFINE_CASE(ctype,aten_name,native) \
        case at::k##aten_name: { \
@ -135,7 +135,7 @@ private:
    }
  }
  template <typename T>
-  void assignToValue(Tensor<Context> * dst, T v) {
+  void assignToValue(Tensor* dst, T v) {
    dst->Resize(std::vector<TIndex>());
    math::Set(1, v, dst->template mutable_data<T>(), &context_);
  }
--- a/caffe2/contrib/gloo/common.cc
+++ b/caffe2/contrib/gloo/common.cc
@ -12,7 +12,7 @@ namespace caffe2 {
 namespace gloo {

 void signalFailure(Blob* status_blob, std::exception& /* unused */) {
-  auto* res = status_blob->GetMutable<TensorCPU>();
+  auto* res = status_blob->GetMutableTensor(CPU);
  res->Resize(1);
  res->template mutable_data<int32_t>()[0] = 1;
 }
--- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
+++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
@ -17,17 +17,17 @@ nccl::NCCLExecution getNCCLElements(
  ex.elements.resize(op->InputSize());
  for (auto i = 0; i < op->InputSize(); ++i) {
    auto& el = ex.elements[i];
-    el.src = &(op->Input<TensorCUDA>(i));
+    el.src = &(op->Input<Tensor>(i, CUDA));
    if (op->OutputSize() == 1) {
      // Reduce op
      if (i == ex.root) {
-        el.dst = op->Output<TensorCUDA>(0);
+        el.dst = op->Output<Tensor>(0, CUDA);
      }
    } else if (i < op->OutputSize()) {
-      el.dst = op->Output<TensorCUDA>(i);
+      el.dst = op->Output<Tensor>(i, CUDA);
    }
    // TODO - expensive (>1ms) - cache these.
-    el.device = GetGPUIDForPointer(op->Input<TensorCUDA>(i).raw_data());
+    el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
  }

  return ex;
@ -38,7 +38,7 @@ namespace {
 template <typename T>
 bool AllInputsAre(OperatorBase* op) {
  for (auto i = 0; i < op->InputSize(); ++i) {
-    if (op->Input<TensorCUDA>(i).IsType<T>()) {
+    if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
      continue;
    } else {
      return false;
--- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
+++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
  option.set_device_type(CUDA);
  CUDAContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
  tensor->Resize(shape);
  math::Set<float, CUDAContext>(tensor->size(), value,
                                tensor->mutable_data<float>(),
@ -54,8 +54,8 @@ TEST(NervanaFullyConnectedTest, Test) {
  EXPECT_TRUE(op->Run());
  Blob* Yblob = ws.GetBlob("Y");
  EXPECT_NE(nullptr, Yblob);
-  auto& Y = Yblob->Get<Tensor<CUDAContext>>();
-  TensorCPU Y_cpu(Y);
+  auto& Y = Yblob->Get<Tensor>();
+  Tensor Y_cpu(Y, CPU);
  EXPECT_EQ(Y.size(), 5 * 6);
  for (int i = 0; i < Y.size(); ++i) {
    CHECK_LT(Y_cpu.data<float>()[i], 10.11);
--- a/caffe2/contrib/warpctc/ctc_op.h
+++ b/caffe2/contrib/warpctc/ctc_op.h
@ -47,26 +47,26 @@ class CTCOp final : public Operator<Context> {
    const auto& inputs = Input(INPUTS);
    const auto minibatchSize = inputs.dim(1);
    const auto alphabetSize = inputs.dim(2);
-    const auto& labels = OperatorBase::template Input<TensorCPU>(LABELS);
+    const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
    const auto& labelLengths =
-        OperatorBase::template Input<TensorCPU>(LABEL_LENGTHS);
+        OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
    const auto& inputLengths =
-        OperatorBase::template Input<TensorCPU>(INPUT_LENGTHS);
+        OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);

    // outputs
-    Tensor<Context>* gradients = nullptr;
+    Tensor* gradients = nullptr;
    TensorCPU* costs;
-    Tensor<Context>* workspace;
+    Tensor* workspace;
    if (!is_test_) {
      // [grads, costs, workspace] to maintain backward compatibility
      gradients = Output(0);
      gradients->ResizeLike(inputs);
-      costs = OperatorBase::template Output<TensorCPU>(1);
+      costs = OperatorBase::template Output<Tensor>(1, CPU);
      costs->ResizeLike(labelLengths);
      workspace = Output(2);
    } else {
      // [costs, workspace]
-      costs = OperatorBase::template Output<TensorCPU>(0);
+      costs = OperatorBase::template Output<Tensor>(0, CPU);
      costs->ResizeLike(labelLengths);
      workspace = Output(1);
    }
--- a/caffe2/core/allocator.cc
+++ b/caffe2/core/allocator.cc
@ -26,7 +26,7 @@ void SetCPUAllocator(CPUAllocator* alloc) {
  g_cpu_allocator.reset(alloc);
 }

-MemoryAllocationReporter CPUContext::reporter_;
+MemoryAllocationReporter CPUStaticContext::reporter_;

 void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
  std::lock_guard<std::mutex> guard(mutex_);
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@ -9,8 +9,9 @@

 #include "caffe2/core/blob_serializer_base.h"
 #include "caffe2/core/common.h"
-#include "caffe2/core/typeid.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/typeid.h"
 #include "caffe2/proto/caffe2.pb.h"

 namespace caffe2 {
@ -60,6 +61,20 @@ class Blob {
  template <class T>
  bool IsType() const { return meta_.Match<T>(); }

+  // TODO(jerryzh): Remove template
+  template <class T>
+  bool IsType(DeviceType device_type) const {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "IsType(DeviceType) only available on "
+        "Tensor types.");
+    auto* tensor = static_cast<Tensor*>(pointer_);
+    if (tensor && tensor->GetDeviceType() == device_type) {
+      return true;
+    }
+    return false;
+  }
+
  /**
   * Returns the meta info of the blob.
   */
@ -74,6 +89,7 @@ class Blob {
   * @brief Gets the const reference of the stored object. The code checks if
   * the stored object is of the desired type.
   */
+  // TODO(jerryzh): add a Get(DeviceType) function?
  template <class T>
  const T& Get() const {
    CAFFE_ENFORCE(
@ -123,6 +139,17 @@ class Blob {
    }
  }

+  inline Tensor* GetMutableTensor(DeviceType device_type) {
+    if (IsType<Tensor>() &&
+        static_cast<Tensor*>(pointer_)->GetDeviceType() == device_type) {
+      return static_cast<Tensor*>(pointer_);
+    } else {
+      VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
+              << " DeviceType:" << device_type;
+      return Reset<Tensor>(new Tensor(device_type));
+    }
+  }
+
  /**
   * Sets the underlying object to the allocated one. The Blob then takes over
   * the ownership of the passed in pointer. If there is already an object in
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@ -17,7 +17,7 @@ TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);

 TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
  if (!caffe2::HasCudaGPU()) return;
-  TensorCUDA tensor;
+  Tensor tensor(CUDA);
  EXPECT_EQ(tensor.ndim(), 0);
  vector<int> dims(3);
  dims[0] = 2;
@ -38,7 +38,7 @@ TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCUDA tensor(dims);
+  Tensor tensor(dims, CUDA);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim32(0), 2);
  EXPECT_EQ(tensor.dim32(1), 3);
@ -65,8 +65,8 @@ TYPED_TEST(TensorGPUTest, TensorShareData) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(dims, CUDA);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@ -82,8 +82,8 @@ TYPED_TEST(TensorGPUTest, TensorShareDataCanUseDifferentShapes) {
  dims[2] = 5;
  vector<int> alternate_dims(1);
  alternate_dims[0] = 2 * 3 * 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(alternate_dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(alternate_dims, CUDA);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(other_tensor.ndim(), 1);
@ -99,8 +99,8 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCUDA tensor(dims);
-  TensorCUDA other_tensor(dims);
+  Tensor tensor(dims, CUDA);
+  Tensor other_tensor(dims, CUDA);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -115,7 +115,7 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
 TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
  if (!HasCudaGPU()) return;
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  TensorCUDA tensor;
+  Tensor tensor(CUDA);
  EXPECT_EQ(tensor.ndim(), 0);
  EXPECT_THROW(tensor.data<TypeParam>(), EnforceNotMet);
 }
@ -126,12 +126,12 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
      return;                                                              \
    }                                                                      \
    Blob blob;                                                             \
-    TensorCPU cpu_tensor;                                                  \
+    Tensor cpu_tensor(CPU);                                                \
    cpu_tensor.Resize(2, 3);                                               \
    for (int i = 0; i < 6; ++i) {                                          \
      cpu_tensor.mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
    }                                                                      \
-    blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor);                   \
+    blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor);                     \
    string serialized = blob.Serialize("test");                            \
    BlobProto proto;                                                       \
    CAFFE_ENFORCE(proto.ParseFromString(serialized));                      \
@ -148,8 +148,8 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
    }                                                                      \
    Blob new_blob;                                                         \
    EXPECT_NO_THROW(new_blob.Deserialize(serialized));                     \
-    EXPECT_TRUE(new_blob.IsType<TensorCUDA>());                            \
-    TensorCPU new_cpu_tensor(blob.Get<TensorCUDA>());                      \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CUDA));                            \
+    Tensor new_cpu_tensor(blob.Get<Tensor>(), CPU);                        \
    EXPECT_EQ(new_cpu_tensor.ndim(), 2);                                   \
    EXPECT_EQ(new_cpu_tensor.dim(0), 2);                                   \
    EXPECT_EQ(new_cpu_tensor.dim(1), 3);                                   \
@ -172,7 +172,7 @@ TEST_SERIALIZATION_GPU_WITH_TYPE(int64_t, int64_data)

 TEST(TensorTest, TensorSerializationMultiDevices) {
  Blob blob;
-  TensorCPU tensor;
+  Tensor tensor(CPU);
  tensor.Resize(2, 3);
  for (int i = 0; i < 6; ++i) {
    tensor.mutable_data<float>()[i] = i;
@ -180,7 +180,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
  for (int gpu_id = 0; gpu_id < NumCudaDevices(); ++gpu_id) {
    DeviceGuard guard(gpu_id);
    CUDAContext context(gpu_id);
-    blob.Reset(new TensorCUDA(tensor, &context));
+    blob.Reset(new Tensor(tensor, &context, CUDA));
    string serialized = blob.Serialize("test");
    BlobProto proto;
    CAFFE_ENFORCE(proto.ParseFromString(serialized));
@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
    // Test if the restored blob is still of the same device.
    blob.Reset();
    EXPECT_NO_THROW(blob.Deserialize(serialized));
-    EXPECT_TRUE(blob.IsType<TensorCUDA>());
+    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
    EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()),
              gpu_id);
    // Test if we force the restored blob on a different device, we
@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
    blob.Reset();
    proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
    EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString()));
-    EXPECT_TRUE(blob.IsType<TensorCUDA>());
+    EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
    EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
  }
 }
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@ -33,7 +33,7 @@ class StringSerializer : public BlobSerializerBase {
  StringSerializer() {}
  ~StringSerializer() {}
  /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
   * otherwise this function produces a fatal error.
   */
  void Serialize(
@ -83,12 +83,242 @@ std::string Blob::Serialize(const string& name) const {
  return data;
 }

-// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
-template <>
-void TensorSerializer<CPUContext>::StoreDeviceDetail(
-    const Tensor<CPUContext>& /*input*/,
-    TensorProto* /*proto*/) {}
+void TensorSerializer::Serialize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor) {
+  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
+}

+void TensorSerializer::SerializeWithChunkSize(
+    const Blob& blob,
+    const string& name,
+    BlobSerializerBase::SerializationAcceptor acceptor,
+    int chunk_size) {
+  CAFFE_ENFORCE(blob.IsType<Tensor>());
+  const auto& tensor = blob.template Get<Tensor>();
+  if (chunk_size == kNoChunking) {
+    chunk_size = tensor.size() + 1; // to account for empty tensors
+  } else if (chunk_size == kDefaultChunkSize) {
+    chunk_size = FLAGS_caffe2_tensor_chunk_size;
+  }
+
+  auto processChunk = [&](int64_t chunkStart) {
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type(kTensorBlobType);
+    TensorProto& proto = *blob_proto.mutable_tensor();
+    proto.set_name(name);
+    this->Serialize(
+        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
+    acceptor(
+        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
+        blob_proto.SerializeAsString());
+  };
+
+#ifndef __ANDROID__
+  std::vector<std::future<void>> futures;
+  // Poorman's IOBound ThreadPool
+  SimpleQueue<size_t> chunkQueue;
+  auto task = [&]() {
+    size_t chunkStart;
+    while (chunkQueue.Pop(&chunkStart)) {
+      processChunk(chunkStart);
+    }
+  };
+  if (tensor.size() > chunk_size) {
+    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
+      futures.emplace_back(std::async(std::launch::async, task));
+    }
+  }
+#endif
+
+  VLOG(1) << "Serializing blob " << name;
+  // Serialize whole vector. If vector is empty, it's shape still needs to be
+  // serialized in empty proto
+  for (size_t chunkBegin = 0;
+       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
+       chunkBegin += chunk_size) {
+    VLOG(2) << "Starting a chunk at " << chunkBegin;
+#ifndef __ANDROID__
+    if (tensor.size() > chunk_size) {
+      chunkQueue.Push(chunkBegin);
+    } else {
+      // Sync mode for small tensors
+      processChunk(chunkBegin);
+    }
+#else
+    // Since Android does not have std::future, we will always do sync mode
+    processChunk(chunkBegin);
+#endif
+  }
+
+#ifndef __ANDROID__
+  chunkQueue.NoMoreJobs();
+  for (auto& fut : futures) {
+    fut.get();
+  }
+#endif
+}
+
+void TensorSerializer::Serialize(
+    const Tensor& input,
+    const string& /*name*/,
+    TensorProto* proto_ptr,
+    size_t chunkBegin,
+    int32_t chunkSize) {
+  CAFFE_ENFORCE(
+      chunkBegin <= input.size(),
+      "Chunk begin is out of tensor: ",
+      chunkBegin,
+      ' ',
+      input.size());
+  if (chunkBegin + chunkSize > input.size()) {
+    chunkSize = input.size() - chunkBegin;
+  }
+
+  CAFFE_ENFORCE(
+      input.raw_data() || chunkSize == 0,
+      "The input does not have data input yet. This is probably because you "
+      "created a tensor of non-zero shape but never filled its data via "
+      "mutable_data() calls. This means that it makes no sense to serialize "
+      "the tensor content.");
+
+  TensorProto& proto = *proto_ptr;
+  proto.mutable_segment()->set_begin(chunkBegin);
+  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
+
+  for (int i = 0; i < input.ndim(); ++i) {
+    proto.add_dims(input.dim(i));
+  }
+  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
+  proto.set_data_type(data_type);
+  StoreDeviceDetail(input, &proto);
+  auto uniq_ptr = input.GetStaticContext()->CreateContext();
+  // A lot of copypaste is error prone. Should we create a macro for this?
+  switch (data_type) {
+    case TensorProto_DataType_FLOAT:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<float>() + chunkBegin,
+          proto.mutable_float_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT32:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<int>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_BYTE:
+      LOG(FATAL) << "This should not happen. When serializing, "
+                    "BYTE is deprecated and moved to UINT8.";
+      break;
+    case TensorProto_DataType_STRING: {
+      proto.mutable_string_data()->Reserve(chunkSize);
+      const string* content = input.template data<string>();
+      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+        proto.add_string_data(content[i]);
+      }
+      break;
+    }
+    case TensorProto_DataType_BOOL:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<bool>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UINT8:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<uint8_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT8:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<int8_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UINT16:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<uint16_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT16:
+      detail::CopyToProtoWithCast(
+          chunkSize,
+          input.template data<int16_t>() + chunkBegin,
+          proto.mutable_int32_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_INT64:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<int64_t>() + chunkBegin,
+          proto.mutable_int64_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_FLOAT16: {
+      if (FLAGS_caffe2_serialize_fp16_as_bytes) {
+        const int kValue = 1;
+        CAFFE_ENFORCE_EQ(
+            reinterpret_cast<const char*>(&kValue)[0],
+            1,
+            "Serialization of FLOAT16 on big endian platform "
+            "is not written yet.");
+        unique_ptr<char[]> buffer(new char[2 * chunkSize]);
+        this->context_->template CopyToCPU<char>(
+            2 * chunkSize,
+            reinterpret_cast<const char*>(
+                input.template data<float16>() + chunkBegin),
+            buffer.get());
+        this->context_->FinishDeviceComputation();
+        proto.set_byte_data(buffer.release(), 2 * chunkSize);
+      } else {
+        detail::CopyToProtoWithCast(
+            chunkSize,
+            reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
+                chunkBegin,
+            proto.mutable_int32_data(),
+            uniq_ptr.get());
+      }
+    } break;
+    case TensorProto_DataType_DOUBLE:
+      detail::CopyToProtoAsIs(
+          chunkSize,
+          input.template data<double>() + chunkBegin,
+          proto.mutable_double_data(),
+          uniq_ptr.get());
+      break;
+    case TensorProto_DataType_UNDEFINED: {
+      proto.mutable_string_data()->Reserve(chunkSize);
+      Blob temp_blob;
+      const char* raw_data = static_cast<const char*>(input.raw_data());
+      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
+        temp_blob.ShareExternal(
+            const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
+        proto.add_string_data(temp_blob.Serialize(""));
+      }
+    } break;
+      // Note: we intentially do not provide "default:" so if any new data types
+      // are added, the compiler should warn the user to add the case here.
+  }
+}
+
+int GetGPUIDForPointer(const void* ptr);
+
+void TensorSerializer::StoreDeviceDetail(
+    const Tensor& input,
+    TensorProto* proto) {
+  input.ExtractDeviceOption(proto->mutable_device_detail());
+}
 // The actual serialization registry objects.
 CAFFE_DEFINE_TYPED_REGISTRY(
    BlobSerializerRegistry,
@ -127,12 +357,176 @@ void Blob::Deserialize(const BlobProto& blob_proto) {
  }
 }

+void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) {
+  auto tensor_proto = blob_proto.tensor();
+  Deserialize(
+      tensor_proto,
+      blob->GetMutableTensor(
+          static_cast<DeviceType>(tensor_proto.device_detail().device_type())));
+}
+
+void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
+  // We create a local context for deserializing. Since Caffe2 contexts are
+  // usually lightweight, this should not involve too much overhead.
+  auto uniq_ptr =
+      tensor->GetStaticContext()->CreateContext(proto.device_detail());
+  auto context = uniq_ptr.get();
+  context->SwitchToDevice(0);
+  vector<TIndex> dims;
+  for (const TIndex d : proto.dims()) {
+    dims.push_back(d);
+  }
+  tensor->Resize(dims);
+
+  int64_t chunkBegin = 0;
+  auto chunkEnd = tensor->size();
+  if (proto.has_segment()) {
+    chunkBegin = proto.segment().begin();
+    chunkEnd = proto.segment().end();
+  }
+  CAFFE_ENFORCE(
+      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
+      "Invalid chunk ",
+      chunkBegin,
+      ' ',
+      chunkEnd,
+      " with total tensor size ",
+      tensor->size());
+  auto chunkSize = chunkEnd - chunkBegin;
+
+  switch (proto.data_type()) {
+    case TensorProto_DataType_FLOAT:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.float_data(),
+          tensor->template mutable_data<float>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT32:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_BYTE:
+      // Since BYTE stores the data in a string field instead of a repreated
+      // field we will have it special cased.
+      CAFFE_ENFORCE_EQ(
+          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
+      context->template CopyToCPU<uint8_t>(
+          chunkSize,
+          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
+          tensor->template mutable_data<uint8_t>() + chunkBegin);
+      break;
+    case TensorProto_DataType_STRING:
+      // Special handing of string because it is a non-fundamental type.
+      {
+        string* content = tensor->template mutable_data<string>();
+        for (int i = 0; i < chunkSize; ++i) {
+          content[i + chunkBegin] = proto.string_data(i);
+        }
+      }
+      break;
+    case TensorProto_DataType_BOOL:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<bool>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UINT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint8_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT8:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int8_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UINT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<uint16_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT16:
+      detail::CopyFromProtoWithCast(
+          chunkSize,
+          proto.int32_data(),
+          tensor->template mutable_data<int16_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_INT64:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.int64_data(),
+          tensor->template mutable_data<int64_t>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_FLOAT16:
+      if (proto.has_byte_data()) {
+        const int kValue = 1;
+        CAFFE_ENFORCE_EQ(
+            reinterpret_cast<const char*>(&kValue)[0],
+            1,
+            "Serialization of FLOAT16 on big endian platform "
+            "is not written yet.");
+        CAFFE_ENFORCE_EQ(
+            2 * chunkSize,
+            proto.byte_data().size(),
+            "Incorrect proto field size.");
+        context->template CopyToCPU<float16>(
+            chunkSize,
+            reinterpret_cast<const float16*>(proto.byte_data().data()),
+            tensor->template mutable_data<float16>() + chunkBegin);
+      } else {
+        // Backward compatibility with models which used int32_data field
+        detail::CopyFromProtoWithCast(
+            chunkSize,
+            proto.int32_data(),
+            reinterpret_cast<uint16_t*>(
+                tensor->template mutable_data<float16>()) +
+                chunkBegin,
+            context);
+      }
+      break;
+    case TensorProto_DataType_DOUBLE:
+      detail::CopyFromProtoAsIs(
+          chunkSize,
+          proto.double_data(),
+          tensor->template mutable_data<double>() + chunkBegin,
+          context);
+      break;
+    case TensorProto_DataType_UNDEFINED: {
+      Blob temp_blob;
+      void* raw_ptr = nullptr;
+      for (int i = 0; i < chunkSize; ++i) {
+        temp_blob.Deserialize(proto.string_data(i));
+        if (i == 0) {
+          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
+        }
+        temp_blob.meta().copy()(
+            temp_blob.GetRaw(),
+            static_cast<char*>(raw_ptr) +
+                (i + chunkBegin) * temp_blob.meta().itemsize(),
+            1);
+      }
+    }
+  }
+  context->FinishDeviceComputation();
+}
+
 namespace {
-// Serialize TensorCPU.
-REGISTER_BLOB_SERIALIZER(
-    (TypeMeta::Id<TensorCPU>()),
-    TensorSerializer<CPUContext>);
-REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
+// Serialize Tensor
+REGISTER_BLOB_SERIALIZER((TypeMeta::Id<Tensor>()), TensorSerializer);
+REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer);
 // Serialize std::string
 REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
 REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@ -42,13 +42,12 @@ inline unique_ptr<BlobSerializerBase> CreateSerializer(CaffeTypeId id) {
 * TensorSerializer takes in a blob that contains a Tensor, and serializes it
 * into a TensorProto protocol buffer.
 */
-template <class Context>
 class TensorSerializer : public BlobSerializerBase {
 public:
-  TensorSerializer() : context_() {}
+  TensorSerializer() {}
  ~TensorSerializer() override {}
  /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
   * otherwise this function produces a fatal error.
   */
  void Serialize(
@ -61,13 +60,17 @@ class TensorSerializer : public BlobSerializerBase {
      SerializationAcceptor acceptor,
      int chunk_size) override;

-  void Serialize(const Tensor<Context>& tensor, const string& name,
-                 TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
+  void Serialize(
+      const Tensor& tensor,
+      const string& name,
+      TensorProto* proto,
+      size_t chunkBegin,
+      int32_t chunkSize);

 private:
  // A utility function to store the device context detauls.
-  void StoreDeviceDetail(const Tensor<Context>& input, TensorProto* proto);
-  Context context_;
+  void StoreDeviceDetail(const Tensor& input, TensorProto* proto);
+  unique_ptr<BaseContext> context_;
 };

 /**
@ -98,11 +101,10 @@ inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
 * tensor, change the TensorProto's corresponding fields before calling
 * Deserialize.
 */
-template <class Context>
 class TensorDeserializer : public BlobDeserializerBase {
 public:
  void Deserialize(const BlobProto& proto, Blob* blob) override;
-  void Deserialize(const TensorProto& proto, Tensor<Context>* tensor);
+  void Deserialize(const TensorProto& proto, Tensor* tensor);
 };

 ////////////////////////////////////////////////////////////////////////////////
@ -110,12 +112,12 @@ class TensorDeserializer : public BlobDeserializerBase {
 ////////////////////////////////////////////////////////////////////////////////

 namespace detail {
-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyToProtoAsIs(
    const size_t size,
    const SrcType* src,
    google::protobuf::RepeatedField<DstType>* field,
-    Context* context) {
+    BaseContext* context) {
  static_assert(
      sizeof(SrcType) == sizeof(DstType),
      "The source type and dest type cannot be copied as-is. Did "
@ -124,23 +126,22 @@ inline void CopyToProtoAsIs(
  for (int i = 0; i < size; ++i) {
    field->Add(0);
  }
-  context->template Copy<SrcType, Context, CPUContext>(
+  context->template CopyToCPU<SrcType>(
      size, src, reinterpret_cast<SrcType*>(field->mutable_data()));
  // Make sure that we finish the copy into the protobuf.
  context->FinishDeviceComputation();
 }

-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyToProtoWithCast(
    const size_t size,
    const SrcType* src,
    google::protobuf::RepeatedField<DstType>* field,
-    Context* context) {
+    BaseContext* context) {
  // TODO: we are having one unnecessary copy here if the context is already
  // CPUContext. Remove it if it is performance critical.
  unique_ptr<SrcType[]> buffer(new SrcType[size]);
-  context->template Copy<SrcType, Context, CPUContext>(
-      size, src, buffer.get());
+  context->template CopyToCPU<SrcType>(size, src, buffer.get());
  context->FinishDeviceComputation();
  field->Reserve(size);
  for (int i = 0; i < size; ++i) {
@ -148,27 +149,27 @@ inline void CopyToProtoWithCast(
  }
 }

-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyFromProtoAsIs(
    const size_t size,
    const google::protobuf::RepeatedField<SrcType>& field,
    DstType* dst,
-    Context* context) {
+    BaseContext* context) {
  static_assert(
      sizeof(SrcType) == sizeof(DstType),
      "The source type and dest type cannot be copied as-is. Did "
      "you mean CopyFromProtoWithCast?");
  CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
-  context->template Copy<DstType, CPUContext, Context>(
+  context->template CopyFromCPU<DstType>(
      size, reinterpret_cast<const DstType*>(field.data()), dst);
 }

-template <typename SrcType, typename DstType, class Context>
+template <typename SrcType, typename DstType>
 inline void CopyFromProtoWithCast(
    const size_t size,
    const google::protobuf::RepeatedField<SrcType>& field,
    DstType* dst,
-    Context* context) {
+    BaseContext* context) {
  CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
  // TODO: we are having one unnecessary copy here if the context is already
  // CPUContext. Remove it if it is performance critical.
@ -177,410 +178,10 @@ inline void CopyFromProtoWithCast(
  for (int i = 0; i < size; ++i) {
    buffer[i] = static_cast<DstType>(src[i]);
  }
-  context->template Copy<DstType, CPUContext, Context>(size, buffer.get(), dst);
+  context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
 }

 }  // namespace detail
-
-template <class Context>
-void TensorSerializer<Context>::Serialize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor) {
-  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
-}
-
-template <class Context>
-void TensorSerializer<Context>::SerializeWithChunkSize(
-    const Blob& blob,
-    const string& name,
-    BlobSerializerBase::SerializationAcceptor acceptor,
-    int chunk_size) {
-  CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
-  const auto& tensor = blob.template Get<Tensor<Context>>();
-  if (chunk_size == kNoChunking) {
-    chunk_size = tensor.size() + 1; // to account for empty tensors
-  } else if (chunk_size == kDefaultChunkSize) {
-    chunk_size = FLAGS_caffe2_tensor_chunk_size;
-  }
-
-  auto processChunk = [&](int64_t chunkStart) {
-    BlobProto blob_proto;
-    blob_proto.set_name(name);
-    blob_proto.set_type(kTensorBlobType);
-    TensorProto& proto = *blob_proto.mutable_tensor();
-    proto.set_name(name);
-    this->Serialize(
-        tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
-    acceptor(
-        MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
-        blob_proto.SerializeAsString());
-  };
-
-#ifndef __ANDROID__
-  std::vector<std::future<void>> futures;
-  // Poorman's IOBound ThreadPool
-  SimpleQueue<size_t> chunkQueue;
-  auto task = [&]() {
-    size_t chunkStart;
-    while (chunkQueue.Pop(&chunkStart)) {
-      processChunk(chunkStart);
-    }
-  };
-  if (tensor.size() > chunk_size) {
-    for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
-      futures.emplace_back(std::async(std::launch::async, task));
-    }
-  }
-#endif
-
-  VLOG(1) << "Serializing blob " << name;
-  // Serialize whole vector. If vector is empty, it's shape still needs to be
-  // serialized in empty proto
-  for (size_t chunkBegin = 0;
-       chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
-       chunkBegin += chunk_size) {
-    VLOG(2) << "Starting a chunk at " << chunkBegin;
-#ifndef __ANDROID__
-    if (tensor.size() > chunk_size) {
-      chunkQueue.Push(chunkBegin);
-    } else {
-      // Sync mode for small tensors
-      processChunk(chunkBegin);
-    }
-#else
-    // Since Android does not have std::future, we will always do sync mode
-    processChunk(chunkBegin);
-#endif
-  }
-
-#ifndef __ANDROID__
-  chunkQueue.NoMoreJobs();
-  for (auto& fut : futures) {
-    fut.get();
-  }
-#endif
-}
-
-template <class Context>
-void TensorSerializer<Context>::Serialize(
-    const Tensor<Context>& input,
-    const string& /*name*/,
-    TensorProto* proto_ptr,
-    size_t chunkBegin,
-    int32_t chunkSize) {
-  CAFFE_ENFORCE(
-      chunkBegin <= input.size(),
-      "Chunk begin is out of tensor: ",
-      chunkBegin,
-      ' ',
-      input.size());
-  if (chunkBegin + chunkSize > input.size()) {
-    chunkSize = input.size() - chunkBegin;
-  }
-
-  CAFFE_ENFORCE(
-      input.raw_data() || chunkSize == 0,
-      "The input does not have data input yet. This is probably because you "
-      "created a tensor of non-zero shape but never filled its data via "
-      "mutable_data() calls. This means that it makes no sense to serialize "
-      "the tensor content.");
-
-  TensorProto& proto = *proto_ptr;
-  proto.mutable_segment()->set_begin(chunkBegin);
-  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
-
-  for (int i = 0; i < input.ndim(); ++i) {
-    proto.add_dims(input.dim(i));
-  }
-  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
-  proto.set_data_type(data_type);
-  StoreDeviceDetail(input, &proto);
-
-  // A lot of copypaste is error prone. Should we create a macro for this?
-  switch (data_type) {
-  case TensorProto_DataType_FLOAT:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<float>() + chunkBegin,
-        proto.mutable_float_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT32:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<int>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_BYTE:
-    LOG(FATAL) << "This should not happen. When serializing, "
-                  "BYTE is deprecated and moved to UINT8.";
-    break;
-  case TensorProto_DataType_STRING:
-    {
-      proto.mutable_string_data()->Reserve(chunkSize);
-      const string* content = input.template data<string>();
-      for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-        proto.add_string_data(content[i]);
-      }
-      break;
-    }
-  case TensorProto_DataType_BOOL:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<bool>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UINT8:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<uint8_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT8:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<int8_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UINT16:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<uint16_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT16:
-    detail::CopyToProtoWithCast(
-        chunkSize,
-        input.template data<int16_t>() + chunkBegin,
-        proto.mutable_int32_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_INT64:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<int64_t>() + chunkBegin,
-        proto.mutable_int64_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_FLOAT16: {
-    if (FLAGS_caffe2_serialize_fp16_as_bytes) {
-      const int kValue = 1;
-      CAFFE_ENFORCE_EQ(
-          reinterpret_cast<const char*>(&kValue)[0],
-          1,
-          "Serialization of FLOAT16 on big endian platform "
-          "is not written yet.");
-      unique_ptr<char[]> buffer(new char[2 * chunkSize]);
-      this->context_.template Copy<char, Context, CPUContext>(
-          2 * chunkSize,
-          reinterpret_cast<const char*>(
-              input.template data<float16>() + chunkBegin),
-          buffer.get());
-      this->context_.FinishDeviceComputation();
-      proto.set_byte_data(buffer.release(), 2 * chunkSize);
-    } else {
-      detail::CopyToProtoWithCast(
-          chunkSize,
-          reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
-              chunkBegin,
-          proto.mutable_int32_data(),
-          &this->context_);
-    }
-  } break;
-  case TensorProto_DataType_DOUBLE:
-    detail::CopyToProtoAsIs(
-        chunkSize,
-        input.template data<double>() + chunkBegin,
-        proto.mutable_double_data(),
-        &this->context_);
-    break;
-  case TensorProto_DataType_UNDEFINED: {
-    proto.mutable_string_data()->Reserve(chunkSize);
-    Blob temp_blob;
-    const char* raw_data = static_cast<const char*>(input.raw_data());
-    for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
-      temp_blob.ShareExternal(
-          const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
-      proto.add_string_data(temp_blob.Serialize(""));
-    }
-  } break;
-    // Note: we intentially do not provide "default:" so if any new data types
-    // are added, the compiler should warn the user to add the case here.
-  }
-}
-
-template <class Context>
-void TensorDeserializer<Context>::Deserialize(
-    const BlobProto& blob_proto,
-    Blob* blob) {
-  Deserialize(blob_proto.tensor(), blob->GetMutable<Tensor<Context>>());
-}
-
-template <class Context>
-void TensorDeserializer<Context>::Deserialize(
-    const TensorProto& proto,
-    Tensor<Context>* tensor) {
-  // We create a local context for deserializing. Since Caffe2 contexts are
-  // usually lightweighted, this should not involve too much overhead.
-  Context context(proto.device_detail());
-  context.SwitchToDevice(0);
-  vector<TIndex> dims;
-  for (const TIndex d : proto.dims()) {
-    dims.push_back(d);
-  }
-  tensor->Resize(dims);
-
-  int64_t chunkBegin = 0;
-  auto chunkEnd = tensor->size();
-  if (proto.has_segment()) {
-    chunkBegin = proto.segment().begin();
-    chunkEnd = proto.segment().end();
-  }
-  CAFFE_ENFORCE(
-      0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
-      "Invalid chunk ",
-      chunkBegin,
-      ' ',
-      chunkEnd,
-      " with total tensor size ",
-      tensor->size());
-  auto chunkSize = chunkEnd - chunkBegin;
-
-  switch (proto.data_type()) {
-    case TensorProto_DataType_FLOAT:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.float_data(),
-          tensor->template mutable_data<float>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT32:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_BYTE:
-      // Since BYTE stores the data in a string field instead of a repreated
-      // field we will have it special cased.
-      CAFFE_ENFORCE_EQ(
-          chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
-      context.template Copy<uint8_t, Context, CPUContext>(
-          chunkSize,
-          reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
-          tensor->template mutable_data<uint8_t>() + chunkBegin);
-      break;
-    case TensorProto_DataType_STRING:
-      // Special handing of string because it is a non-fundamental type.
-      {
-        string* content = tensor->template mutable_data<string>();
-        for (int i = 0; i < chunkSize; ++i) {
-          content[i + chunkBegin] = proto.string_data(i);
-        }
-      }
-      break;
-    case TensorProto_DataType_BOOL:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<bool>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UINT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint8_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT8:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int8_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UINT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<uint16_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT16:
-      detail::CopyFromProtoWithCast(
-          chunkSize,
-          proto.int32_data(),
-          tensor->template mutable_data<int16_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_INT64:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.int64_data(),
-          tensor->template mutable_data<int64_t>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_FLOAT16:
-      if (proto.has_byte_data()) {
-        const int kValue = 1;
-        CAFFE_ENFORCE_EQ(
-            reinterpret_cast<const char*>(&kValue)[0],
-            1,
-            "Serialization of FLOAT16 on big endian platform "
-            "is not written yet.");
-        CAFFE_ENFORCE_EQ(
-            2 * chunkSize,
-            proto.byte_data().size(),
-            "Incorrect proto field size.");
-        context.template Copy<float16, Context, CPUContext>(
-            chunkSize,
-            reinterpret_cast<const float16*>(proto.byte_data().data()),
-            tensor->template mutable_data<float16>() + chunkBegin);
-      } else {
-        // Backward compatibility with models which used int32_data field
-        detail::CopyFromProtoWithCast(
-            chunkSize,
-            proto.int32_data(),
-            reinterpret_cast<uint16_t*>(
-                tensor->template mutable_data<float16>()) +
-                chunkBegin,
-            &context);
-      }
-      break;
-    case TensorProto_DataType_DOUBLE:
-      detail::CopyFromProtoAsIs(
-          chunkSize,
-          proto.double_data(),
-          tensor->template mutable_data<double>() + chunkBegin,
-          &context);
-      break;
-    case TensorProto_DataType_UNDEFINED: {
-      Blob temp_blob;
-      void* raw_ptr = nullptr;
-      for (int i = 0; i < chunkSize; ++i) {
-        temp_blob.Deserialize(proto.string_data(i));
-        if (i == 0) {
-          raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
-        }
-        temp_blob.meta().copy()(
-            temp_blob.GetRaw(),
-            static_cast<char*>(raw_ptr) +
-                (i + chunkBegin) * temp_blob.meta().itemsize(),
-            1);
-      }
-    }
-  }
-  context.FinishDeviceComputation();
-}
-
 }  // namespace caffe2

 #endif  // CAFFE2_CORE_BLOB_SERIALIZATION_H_
--- a/caffe2/core/blob_serialization_gpu.cc
+++ b/caffe2/core/blob_serialization_gpu.cc
@ -4,20 +4,7 @@

 namespace caffe2 {

-template <>
-void TensorSerializer<CUDAContext>::StoreDeviceDetail(
-    const Tensor<CUDAContext>& input, TensorProto* proto) {
-  auto* device_detail = proto->mutable_device_detail();
-  device_detail->set_device_type(CUDA);
-  device_detail->set_cuda_gpu_id(
-      GetGPUIDForPointer(input.raw_data()));
-}
-
 namespace {
-REGISTER_BLOB_SERIALIZER(
-    (TypeMeta::Id<TensorCUDA>()),
-    TensorSerializer<CUDAContext>);
-REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer<CUDAContext>);
+REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
 }
 }  // namespace caffe2
-
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@ -47,7 +47,7 @@ class BlobTestFooSerializer : public BlobSerializerBase {
  BlobTestFooSerializer() {}
  ~BlobTestFooSerializer() {}
  /**
-   * Serializes a Blob. Note that this blob has to contain Tensor<Context>,
+   * Serializes a Blob. Note that this blob has to contain Tensor,
   * otherwise this function produces a fatal error.
   */
  void Serialize(
@ -181,7 +181,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);

  auto* ptr = tensor.mutable_data<int>();
  EXPECT_TRUE(ptr != nullptr);
@ -200,7 +200,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {

  // share the data with other tensor so that the pointer won't be reused
  // when we reallocate
-  TensorCPU other_tensor(dims);
+  Tensor other_tensor(dims, CPU);
  other_tensor.ShareData(tensor);
  // but double is bigger, so it should allocate a new one
  auto* doubleptr = tensor.mutable_data<double>();
@ -215,7 +215,7 @@ TEST(TensorNonTypedTest, NonDefaultConstructible) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);

  // this doesn't compile - good!
  // auto* ptr = tensor.mutable_data<BlobTestNonDefaultConstructible>();
@ -232,7 +232,7 @@ TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
 TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);

 TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
  EXPECT_EQ(tensor.ndim(), 0);
  vector<int> dims(3);
  dims[0] = 2;
@ -253,7 +253,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim32(0), 2);
  EXPECT_EQ(tensor.dim32(1), 3);
@ -279,7 +279,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedZeroDim) {
  dims[0] = 2;
  dims[1] = 0;
  dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim32(0), 2);
  EXPECT_EQ(tensor.dim32(1), 0);
@ -293,7 +293,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim32(0), 2);
  EXPECT_EQ(tensor.dim32(1), 3);
@ -317,7 +317,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {

 TYPED_TEST(TensorCPUTest, TensorInitializedScalar) {
  vector<int> dims;
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  EXPECT_EQ(tensor.ndim(), 0);
  EXPECT_EQ(tensor.size(), 1);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
@ -329,8 +329,8 @@ TYPED_TEST(TensorCPUTest, TensorShareData) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@ -349,7 +349,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointer) {
  dims[1] = 3;
  dims[2] = 5;
  std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2*3*5]);
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  tensor.ShareExternalPointer(raw_buffer.get());
  EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
  EXPECT_EQ(tensor.data<TypeParam>(), raw_buffer.get());
@ -366,7 +366,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
  dims[1] = 3;
  dims[2] = 5;
  std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2 * 3 * 5]);
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  TypeMeta meta = TypeMeta::Make<TypeParam>();
  tensor.ShareExternalPointer(raw_buffer.get(), meta);
  EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
@ -380,7 +380,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {

 TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
  std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
-  TensorCPU tensor;
+  Tensor tensor(CPU);
  ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
 }

@ -391,8 +391,8 @@ TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
  dims[2] = 5;
  vector<int> alternate_dims(1);
  alternate_dims[0] = 2 * 3 * 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(alternate_dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(alternate_dims, CPU);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(other_tensor.ndim(), 1);
@ -413,8 +413,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterResize) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -431,8 +431,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterFreeMemory) {
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
-  TensorCPU tensor(dims);
-  TensorCPU other_tensor(dims);
+  Tensor tensor(dims, CPU);
+  Tensor other_tensor(dims, CPU);
  EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -449,7 +449,7 @@ TYPED_TEST(TensorCPUTest, KeepOnShrink) {
  FLAGS_caffe2_max_keep_on_shrink_memory = LLONG_MAX;

  vector<int> dims{2, 3, 5};
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  TypeParam* ptr = tensor.mutable_data<TypeParam>();
  EXPECT_TRUE(ptr != nullptr);
  // Expanding - will reallocate
@ -480,7 +480,7 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
  FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 4 * sizeof(TypeParam);

  vector<int> dims{1, 8, 8};
-  TensorCPU tensor(dims);
+  Tensor tensor(dims, CPU);
  TypeParam* ptr = tensor.mutable_data<TypeParam>();
  EXPECT_TRUE(ptr != nullptr);
  // Shrinking - will not reallocate
@ -501,19 +501,19 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
 }

 TYPED_TEST(TensorCPUDeathTest, CannotAccessRawDataWhenEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_ANY_THROW(tensor.raw_data());
 }

 TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
-  TensorCPU tensor;
+  Tensor tensor(CPU);
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_ANY_THROW(tensor.data<TypeParam>());
 }

 TEST(TensorTest, TensorNonFundamentalType) {
-  TensorCPU tensor(vector<int>{2, 3, 4});
+  Tensor tensor(vector<int>{2, 3, 4}, CPU);
  EXPECT_TRUE(tensor.mutable_data<std::string>() != nullptr);
  const std::string* ptr = tensor.data<std::string>();
  for (int i = 0; i < tensor.size(); ++i) {
@ -522,14 +522,14 @@ TEST(TensorTest, TensorNonFundamentalType) {
 }

 TEST(TensorTest, TensorNonFundamentalTypeClone) {
-  TensorCPU tensor(vector<int>{2, 3, 4});
+  Tensor tensor(vector<int>{2, 3, 4}, CPU);
  std::string* ptr = tensor.mutable_data<std::string>();
  EXPECT_TRUE(ptr != nullptr);
  for (int i = 0; i < tensor.size(); ++i) {
    EXPECT_TRUE(ptr[i] == "");
    ptr[i] = "filled";
  }
-  TensorCPU dst_tensor = tensor.Clone();
+  Tensor dst_tensor = tensor.Clone();
  const std::string* dst_ptr = dst_tensor.data<std::string>();
  for (int i = 0; i < dst_tensor.size(); ++i) {
    EXPECT_TRUE(dst_ptr[i] == "filled");
@ -549,7 +549,7 @@ TEST(TensorTest, Tensor64BitDimension) {
  // Initialize a large tensor.
  TIndex large_number =
      static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  TensorCPU tensor(vector<TIndex>{large_number});
+  Tensor tensor(vector<TIndex>{large_number}, CPU);
  EXPECT_EQ(tensor.ndim(), 1);
  EXPECT_EQ(tensor.dim(0), large_number);
  EXPECT_EQ(tensor.size(), large_number);
@ -581,7 +581,7 @@ TEST(TensorTest, Tensor64BitDimension) {
 TEST(TensorDeathTest, CannotCastDownLargeDims) {
  TIndex large_number =
      static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
-  TensorCPU tensor(vector<TIndex>{large_number});
+  Tensor tensor(vector<TIndex>{large_number}, CPU);
  EXPECT_EQ(tensor.ndim(), 1);
  EXPECT_EQ(tensor.dim(0), large_number);
  ASSERT_THROW(tensor.dim32(0), EnforceNotMet);
@ -590,7 +590,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
 #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name)               \
  TEST(TensorTest, TensorSerialization_##TypeParam) {                     \
    Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
+    Tensor* tensor = blob.GetMutableTensor(CPU);                          \
    tensor->Resize(2, 3);                                                 \
    for (int i = 0; i < 6; ++i) {                                         \
      tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i);   \
@ -611,7 +611,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
    }                                                                     \
    Blob new_blob;                                                        \
    EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
    const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
    EXPECT_EQ(new_tensor.ndim(), 2);                                      \
    EXPECT_EQ(new_tensor.dim(0), 2);                                      \
@ -624,7 +624,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
                                                                          \
  TEST(EmptyTensorTest, TensorSerialization_##TypeParam) {                \
    Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();                     \
+    TensorCPU* tensor = blob.GetMutableTensor(CPU);                       \
    tensor->Resize(0, 3);                                                 \
    tensor->mutable_data<TypeParam>();                                    \
    string serialized = blob.Serialize("test");                           \
@ -640,7 +640,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
    EXPECT_EQ(tensor_proto.field_name##_size(), 0);                       \
    Blob new_blob;                                                        \
    EXPECT_NO_THROW(new_blob.Deserialize(serialized));                    \
-    EXPECT_TRUE(new_blob.IsType<TensorCPU>());                            \
+    EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));                            \
    const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
    EXPECT_EQ(new_tensor.ndim(), 2);                                      \
    EXPECT_EQ(new_tensor.dim(0), 0);                                      \
@ -659,7 +659,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data)

 TEST(TensorTest, TensorSerialization_CustomType) {
  Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
  tensor->Resize(2, 3);
  for (int i = 0; i < 6; ++i) {
    tensor->mutable_data<BlobTestFoo>()[i].val = i;
@ -671,7 +671,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
  EXPECT_EQ(proto.type(), "Tensor");
  Blob new_blob;
  EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
+  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
  const TensorCPU& new_tensor = blob.Get<TensorCPU>();
  EXPECT_EQ(new_tensor.ndim(), 2);
  EXPECT_EQ(new_tensor.dim(0), 2);
@ -686,7 +686,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
 TEST(TensorTest, float16) {
  const TIndex kSize = 3000000;
  Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
  tensor->Resize(kSize);
  for (int i = 0; i < tensor->size(); ++i) {
    tensor->mutable_data<float16>()[i].x = i % 10000;
@ -714,7 +714,7 @@ TEST(TensorTest, float16) {
  }
  Blob new_blob;
  EXPECT_NO_THROW(new_blob.Deserialize(serialized));
-  EXPECT_TRUE(new_blob.IsType<TensorCPU>());
+  EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
  const TensorCPU& new_tensor = blob.Get<TensorCPU>();
  EXPECT_EQ(new_tensor.ndim(), 1);
  EXPECT_EQ(new_tensor.dim(0), kSize);
@ -850,7 +850,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
  {
    VLOG(1) << "Test begin";
    Blob blob;
-    TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+    Tensor* tensor = blob.GetMutableTensor(CPU);
    VLOG(1) << "Allocating blob";
    tensor->Resize(d1, d2);
    auto mutableData = tensor->mutable_data<TypeParam>();
@ -893,7 +893,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
    load_op->Run();
    VLOG(1) << "Reading blob from workspace";
    auto new_blob = ws.GetBlob("test");
-    EXPECT_TRUE(new_blob->IsType<TensorCPU>());
+    EXPECT_TRUE(new_blob->IsType<Tensor>(CPU));
    const auto& new_tensor = new_blob->Get<TensorCPU>();

    EXPECT_EQ(new_tensor.ndim(), d1);
@ -1020,7 +1020,7 @@ TEST(CustomChunkSize, BigTensorSerialization) {
  int64_t size = d1 * d2;

  Blob blob;
-  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
+  TensorCPU* tensor = blob.GetMutableTensor(CPU);
  tensor->Resize(d1, d2);
  tensor->mutable_data<float>();
  std::mutex mutex;
@ -1070,10 +1070,9 @@ TEST(BlobTest, CastingMessage) {
 }

 TEST(TensorConstruction, UnitializedCopyTest) {
-  CPUContext context;
-  TensorCPU x;
-  TensorCPU y(x, &context);
-  TensorCPU z = x.Clone();
+  Tensor x(CPU);
+  Tensor y(x, CPU);
+  Tensor z = x.Clone();
  // should be uninitialized
  EXPECT_EQ(x.size(), -1);
  EXPECT_EQ(y.size(), -1);
@ -1082,14 +1081,11 @@ TEST(TensorConstruction, UnitializedCopyTest) {
 }

 TEST(TensorConstruction, CopyConstructorTest) {
-  CPUContext context;
-
-  TensorCPU x;
+  Tensor x(CPU);
  x.Resize(5);
  x.mutable_data<float>()[0] = 1;
-  TensorCPU y = x.Clone();
-  TensorCPU z(x, &context);
-  TensorCPU w;
+  Tensor y = x.Clone();
+  Tensor z(x, CPU);

  EXPECT_EQ(*x.data<float>(), 1);
  EXPECT_EQ(*y.data<float>(), 1);
@ -1100,13 +1096,12 @@ TEST(TensorConstruction, CopyConstructorTest) {
  EXPECT_EQ(*z.data<float>(), 1);
 }

-TEST(TensorConstruction, MoveConstructorTest) {
-  CPUContext context;
-
-  TensorCPU x;
+TEST(TensorConstruction, MoveAssignmentOpTest) {
+  Tensor x(CPU);
  x.Resize(5);
  x.mutable_data<float>()[0] = 1;
-  TensorCPU y = std::move(x);
+  Tensor y(CPU);
+  y = std::move(x);

  EXPECT_EQ(*y.data<float>(), 1);
 }
--- a/caffe2/core/context.cc
+++ b/caffe2/core/context.cc
@ -7,6 +7,12 @@

 namespace caffe2 {

+// We put this here because context.h rather than context_base.h is included in
+// user code
+// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
+CAFFE2_API BaseStaticContext*
+    BaseContext::static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
+
 uint32_t RandomNumberSeed() {
  // Originally copied from folly::randomNumberSeed (at 418ad4)
  // modified to use chrono instead of sys/time.h
@ -24,4 +30,11 @@ uint32_t RandomNumberSeed() {
      kPrime2 * tv_sec + kPrime3 * tv_usec;
 }

+BaseStaticContext* GetCPUStaticContext() {
+  static CPUStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(CPU, GetCPUStaticContext());
+
 } // namespace caffe2
--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@ -7,6 +7,7 @@
 #include <unordered_map>

 #include "caffe2/core/allocator.h"
+#include "caffe2/core/context_base.h"
 #include "caffe2/core/event.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/typeid.h"
@ -16,6 +17,8 @@ CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);

 namespace caffe2 {

+BaseStaticContext* GetCPUStaticContext();
+
 /**
 * A function to generate a random number seed that is unique in a best-effort
 * basis, using an ever-incrementing seed and the current time.
@ -26,44 +29,15 @@ uint32_t RandomNumberSeed();
 * The CPU Context, representing the bare minimum of what a Context class in
 * Caffe2 should implement.
 *
+ * // TODO modify docs
 * See operator.h, especially Operator<Context>, for how Context are used in
 * actual operator implementations that are associated with specific devices.
 * In general, the Context class is passed in as a template argument, and
 * the operator can use the functions defined in the context to execute whatever
 * computation it has.
 *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes have the freedom to choose what functions it
- * implements, but there are a few functions that you should consider
- * implementing if you want to write your own context class:
- * - void SwitchToDevice(): any necessary code to switch to the device before
- *     running anything.
- * - void WaitEvent(const Event& ev): make the current context to wait on
- *     an event. For example, for cuda, this is the equivalent of
- *     cudaStreamWaitEvent. For CPU context, it essentially synchronizes the
- *     event.
- * - void Record(Event* ev): record the async activities on the current context
- *     to the event. For example, for cuda, this is the equivalent of
- *     cudaEventRecord on the current stream. For CPU context, it is always
- *     synchronous.
- * - void FinishDeviceComputation(): any wrapping-up work after all the
- *     computation of the operator is done. If there are errors during the
- *     execution, throw exception. For example, in a CUDAContext, this function
- *     carries out a stream synchronization and spots potential errors for
- *     the cuda kernel calls.
- * - static std::pair<void*, MemoryDeleter> New(size_t nbytes): allocates
-       memory and returns a deleter.
- * - template <class SrcContext, class DstContext> void CopyBytes(...): does
- *     cross context memory copy.
- * - template <typename T, class SrcContext, class DstContext> void Copy(...):
- *     usually a simple wrapper around the above CopyBytes function.
- *
- * We intentionally did not create a base class for the various possible Context
- * classes there might be, since they are intended to be specified during
- * compile time using templates rather than via polymorphism. You should also
- * not have classes derived from existing context classes.
 */
-class CPUContext final {
+class CPUContext final : public BaseContext {
 public:
  typedef std::mt19937 rand_gen_type;
  CPUContext() : random_seed_(RandomNumberSeed()) {}
@ -74,23 +48,30 @@ class CPUContext final {
    CAFFE_ENFORCE_EQ(option.device_type(), CPU);
  }

-  ~CPUContext() noexcept {}
+  ~CPUContext() noexcept override {}

-  inline void SwitchToDevice(int /*stream_id*/) {}
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
+  BaseStaticContext* GetStaticContext() const override {
+    return GetCPUStaticContext();
  }

-  inline void WaitEvent(const Event& ev) {
+  static BaseStaticContext* StaticContext() {
+    return GetCPUStaticContext();
+  }
+
+  inline void SwitchToDevice(int /*stream_id*/) override {}
+
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
    ev.Wait(CPU, this);
  }

-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CPU, this, err_msg);
  }

-  inline void FinishDeviceComputation() {}
+  inline void FinishDeviceComputation() override {}

  inline rand_gen_type& RandGenerator() {
    if (!random_generator_.get()) {
@ -99,16 +80,32 @@ class CPUContext final {
    return *random_generator_.get();
  }

-  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      reporter_.New(data_and_deleter.first, nbytes);
-      data_and_deleter.second = ReportAndDelete;
-    }
-    return data_and_deleter;
+  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
+    if (nbytes == 0) {
+      return;
+    }
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // CPU non fumdamental type copy OK
+    return true;
  }

-  // Two copy functions that deals with cross-device copies.
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst);

@ -147,14 +144,65 @@ class CPUContext final {

  // CPU streams are not implemented and are silently ignored by CPU ops,
  // return true to signal executor to schedule a CPU op
-  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
+  static bool IsStreamFree(
+      const DeviceOption& /* option */,
+      int /* stream_id */) {
    return true;
  }

+  DeviceType GetDevicetype() const override {
+    return CPU;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return CPU;
+  }
+
 protected:
  // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
  int random_seed_{1701};
  std::unique_ptr<rand_gen_type> random_generator_;
+};
+
+template <>
+inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
+    size_t nbytes,
+    const void* src,
+    void* dst) {
+  if (nbytes == 0) {
+    return;
+  }
+  CAFFE_ENFORCE(src);
+  CAFFE_ENFORCE(dst);
+  memcpy(dst, src, nbytes);
+}
+
+// TODO(jerryzh): merge CPUStaticContext with Allocator
+class CPUStaticContext : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      reporter_.New(data_and_deleter.first, nbytes);
+      data_and_deleter.second = ReportAndDelete;
+    }
+    return data_and_deleter;
+  }
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<CPUContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<CPUContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return CPU;
+  }
+
+ protected:
  CAFFE2_API static MemoryAllocationReporter reporter_;

 private:
@ -164,17 +212,6 @@ class CPUContext final {
  }
 };

-template<>
-inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
-    size_t nbytes, const void* src, void* dst) {
-  if (nbytes == 0) {
-    return;
-  }
-  CAFFE_ENFORCE(src);
-  CAFFE_ENFORCE(dst);
-  memcpy(dst, src, nbytes);
-}
-
 }  // namespace caffe2

 #endif  // CAFFE2_CORE_CONTEXT_H_
--- a/caffe2/core/context_base.cc
+++ b/caffe2/core/context_base.cc
@ -0,0 +1,3 @@
+#include "context_base.h"
+
+namespace caffe2 {}
--- a/caffe2/core/context_base.h
+++ b/caffe2/core/context_base.h
@ -0,0 +1,191 @@
+#pragma once
+
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+#include <unordered_map>
+
+#include "caffe2/core/allocator.h"
+#include "caffe2/core/event.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/core/typeid.h"
+#include "caffe2/proto/caffe2.pb.h"
+
+namespace caffe2 {
+
+class BaseContext;
+
+/* BaseStaticContext defines the interface for static context, which contains
+   functions that are invoked statically before in Tensor class, e.g. New,
+   We will merge this with Allocator later.
+ */
+class BaseStaticContext {
+ public:
+  virtual ~BaseStaticContext() noexcept {}
+
+  virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) const = 0;
+
+  virtual std::unique_ptr<BaseContext> CreateContext() = 0;
+
+  virtual std::unique_ptr<BaseContext> CreateContext(const DeviceOption&) = 0;
+
+  virtual DeviceType GetDeviceType() = 0;
+
+  /*
+   * @brief: Sets the DeviceOption for argument `device` based on the
+   * current context and the a data pointer
+   */
+  virtual void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) {
+    device->set_device_type(GetDeviceType());
+  }
+};
+
+/**
+ * Virtual interface for the Context class in Caffe2.
+ *
+ * A Context defines all the necessities to run an operator on a specific
+ * device. Specific Context classes needs to implement all the pure virtual
+ * functions in the BaseContext class.
+ * TODO: add docs after this is finalized.
+ */
+class BaseContext {
+ public:
+  virtual ~BaseContext() noexcept {}
+
+  virtual BaseStaticContext* GetStaticContext() const = 0;
+
+  /* Sorry for the naming, will get rid of this in future diff */
+  virtual DeviceType GetDevicetype() const = 0;
+
+  virtual void SwitchToDevice(int /*stream_id*/) = 0;
+
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
+
+  virtual void WaitEvent(const Event& ev) = 0;
+
+  virtual void Record(Event* ev, const char* err_msg = nullptr) const = 0;
+
+  virtual void FinishDeviceComputation() = 0;
+
+  // This used to be arbitrary cross-device copy, but it turns out everyone
+  // did direct CPU-X copy, so we just make three functions for it (to avoid
+  // double dispatch).  This will get obsoleted by C10. where copies
+  // will be proper operators (and get to rely on multiple dispatch there.)
+  virtual void
+  CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
+
+  virtual void CopyBytesToDevice(
+      size_t nbytes,
+      const void* src,
+      void* dst,
+      DeviceType type) {
+    if (type == CPU) {
+      CopyBytesToCPU(nbytes, src, dst);
+    } else if (type == GetDevicetype()) {
+      CopyBytesSameDevice(nbytes, src, dst);
+    } else {
+      CAFFE_THROW(
+          "CopyBytesToDevice can only copy to CPU or between same "
+          "device. Can't copy from: ",
+          GetDevicetype(),
+          " to",
+          type);
+    }
+  }
+
+  template <typename T>
+  inline void CopySameDevice(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value,
+        "CopySameDevice requires fundamental types");
+    CopyBytesSameDevice(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  template <typename T>
+  inline void CopyFromCPU(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value,
+        "CopyFromCPU requires fundamental types");
+    CopyBytesFromCPU(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  template <typename T>
+  inline void CopyToCPU(size_t n, const T* src, T* dst) {
+    static_assert(
+        std::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
+    CopyBytesToCPU(
+        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
+  }
+
+  virtual bool SupportsNonFundamentalTypes() const {
+    return false;
+  }
+
+  inline void EnforceMetaCopyOK() {
+    CAFFE_ENFORCE(
+        SupportsNonFundamentalTypes(), "Context requires fundamental types");
+  }
+
+  inline void CopyItemsSameDevice(
+      const TypeMeta& meta,
+      size_t n,
+      const void* src,
+      void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesSameDevice(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  inline void
+  CopyItemsFromCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesFromCPU(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  inline void
+  CopyItemsToCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
+    if (meta.copy()) {
+      EnforceMetaCopyOK();
+      meta.copy()(src, dst, n);
+    } else {
+      CopyBytesToCPU(n * meta.itemsize(), src, dst);
+    }
+  }
+
+  CAFFE2_API static BaseStaticContext*
+      static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
+
+  template <int d>
+  friend struct StaticContextFunctionRegisterer;
+};
+
+template <int d>
+struct StaticContextFunctionRegisterer {
+  explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
+    static_assert(d < COMPILE_TIME_MAX_DEVICE_TYPES, "");
+    BaseContext::static_context_[d] = ptr;
+  }
+};
+
+#define REGISTER_STATIC_CONTEXT(d, f)                                \
+  namespace {                                                        \
+  static StaticContextFunctionRegisterer<d> g_static_context_##d(f); \
+  }
+
+#define GET_STATIC_CONTEXT(d) BaseContext::static_context_[d]
+} // namespace caffe2
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@ -59,7 +59,6 @@ CAFFE2_DEFINE_int(

 namespace caffe2 {

-
 thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;

 // TODO(jiayq): these variables shouldn't be currently accessed during static
@ -100,19 +99,6 @@ CudaMemoryPoolType GetCudaMemoryPoolType() {
  return g_cuda_memory_pool_type;
 }

-vector<TIndex> GetCUDATensorInfo(
-    const void* c,
-    bool* shares_data,
-    size_t* capacity,
-    DeviceOption* device) {
-  vector<TIndex> dims =
-      GetTensorInfo<CUDAContext>(c, shares_data, capacity, device);
-  const Tensor<CUDAContext>* tc = static_cast<const Tensor<CUDAContext>*>(c);
-  device->set_device_type(CUDA);
-  device->set_cuda_gpu_id(GetGPUIDForPointer(tc->raw_data()));
-  return dims;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all cuda environments that Caffe
 // uses. This gets done the first time a caffe2::CUDAContext::New() gets called
@ -163,14 +149,6 @@ static void Caffe2InitializeCuda() {
    }
  }

-  RegisterTypeCallFunction(
-    TypeMeta::Id<Tensor<CUDAContext>>(),
-    GetTensorType<CUDAContext>
-  );
-
-  RegisterTensorInfoFunction(
-      TypeMeta::Id<Tensor<CUDAContext>>(), GetCUDATensorInfo);
-
 #ifdef CAFFE2_USE_CUDNN
  // Check the versions of cuDNN that were compiled and linked with are compatible
  CheckCuDNNVersions();
@ -252,21 +230,6 @@ struct Caffe2CudaInitializerHelper {
    }
  }
 };
-
-struct TensorCUDAStatGetter : BlobStatGetter {
-  size_t sizeBytes(const Blob& blob) const override {
-    const auto& tensor = blob.Get<TensorCUDA>();
-    auto nbytes = tensor.nbytes();
-    if (nbytes > 0 && tensor.IsType<std::string>()) {
-      const auto* data = tensor.data<std::string>();
-      for (int i = 0; i < tensor.size(); ++i) {
-        nbytes += data[i].size();
-      }
-    }
-    return nbytes;
-  }
-};
-REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter);
 } // namespace

 /**
@ -343,7 +306,7 @@ void TrackMemoryAlloc(size_t nbytes) {
 }
 }

-std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
+std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
  // Lock the mutex
  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
  // A one-time caffe2 cuda initializer.
@ -381,7 +344,7 @@ std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
  return {nullptr, Delete};
 }

-void CUDAContext::Delete(void* ptr) {
+void CUDAStaticContext::Delete(void* ptr) {
  // lock the mutex
  std::lock_guard<std::mutex> lock(CUDAContext::mutex());

@ -433,4 +396,11 @@ void CUDAContext::Delete(void* ptr) {
  }
 }

+BaseStaticContext* GetCUDAStaticContext() {
+  static CUDAStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(CUDA, GetCUDAStaticContext());
+
 }  // namespace caffe2
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@ -7,6 +7,7 @@
 #include "caffe2/core/common.h"
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
+#include "caffe2/core/context_base.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/numa.h"
 #include "caffe2/core/tensor.h"
@ -134,37 +135,46 @@ class ThreadLocalCUDAObjects {
 #endif // CAFFE2_USE_CUDNN
 };

-class CUDAContext final {
+BaseStaticContext* GetCUDAStaticContext();
+
+class CUDAContext final : public BaseContext {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(const int gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);

-  ~CUDAContext() {
+  ~CUDAContext() override {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    FinishDeviceComputation();
  }

-  inline void SwitchToDevice(int stream_id) {
+  BaseStaticContext* GetStaticContext() const override {
+    return GetCUDAStaticContext();
+  }
+
+  static BaseStaticContext* StaticContext() {
+    return GetCUDAStaticContext();
+  }
+
+  inline void SwitchToDevice(int stream_id) override {
    set_stream_id(stream_id);
    CaffeCudaSetDevice(gpu_id_);
  }
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }

-  inline void WaitEvent(const Event& ev) {
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
    ev.Wait(CUDA, this);
  }

-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CUDA, this, err_msg);
  }

-  void FinishDeviceComputation() {
+  void FinishDeviceComputation() override {
    cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
@ -211,7 +221,9 @@ class CUDAContext final {
    return curand_generator_;
  }

-  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
+  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }

  // Get a mutex to lock out cudaMalloc / cudaFree calls when
  // NCCL kernels are being launched. Should remove threat of
@ -233,6 +245,18 @@ class CUDAContext final {
        cuda_objects_.GetStream(gpu_id_, stream_id_)));
  }

+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
+  }
+
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(n * sizeof(T),
@ -261,8 +285,15 @@ class CUDAContext final {
    return cudaStreamQuery(stream) == cudaSuccess;
  }

+  DeviceType GetDevicetype() const override {
+    return CUDA;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return CUDA;
+  }
+
 protected:
-  static void Delete(void* data);
  void set_stream_id(int stream_id) {
    stream_id_ = stream_id;
  }
@ -350,8 +381,37 @@ struct PinnedCPUAllocator final : CPUAllocator {
  DefaultCPUAllocator baseAllocator_;
 };

-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<CUDAContext> TensorCUDA;
+class CUDAStaticContext final : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<CUDAContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<CUDAContext>(option);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
+    return caffe2::make_unique<CUDAContext>(gpu_id);
+  }
+
+  DeviceType GetDeviceType() override {
+    return CUDA;
+  }
+
+  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
+    device->set_device_type(GetDeviceType());
+    device->set_cuda_gpu_id(GetGPUIDForPointer(data));
+  }
+
+ protected:
+  static void Delete(void* data);
+};
+
+using TensorCUDA = Tensor;

 }  // namespace caffe2

--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@ -26,7 +26,7 @@ TEST(CPUContextTest, TestAllocDealloc) {
  }
  DeviceOption option;
  CPUContext context(option);
-  context.Copy<float, CPUContext, CPUContext>(10, data, dst_data);
+  context.CopyToCPU<float>(10, data, dst_data);
  for (int i = 0; i < 10; ++i) {
    EXPECT_FLOAT_EQ(dst_data[i], i);
  }
--- a/caffe2/core/dispatch/CMakeLists.txt
+++ b/caffe2/core/dispatch/CMakeLists.txt
@ -18,6 +18,7 @@ set(TEST_SOURCES

 add_library(dispatch OBJECT ${LIB_SOURCES})
 target_enable_style_warnings(dispatch)
+add_dependencies(dispatch Caffe2_PROTO)

 if(BUILD_TEST)
    add_executable(dispatch_test ${TEST_SOURCES} $<TARGET_OBJECTS:dispatch>)
--- a/caffe2/core/dispatch/OpSchema.h
+++ b/caffe2/core/dispatch/OpSchema.h
@ -1,13 +1,12 @@
 #pragma once

 #include "caffe2/core/dispatch/DispatchKey.h"
-#include "caffe2/utils/Metaprogramming.h"
+#include "caffe2/proto/caffe2.pb.h"
 #include "caffe2/utils/Array.h"
+#include "caffe2/utils/Metaprogramming.h"

 namespace caffe2 {
-template<class Context> class Tensor;
-class CPUContext;
-class CUDAContext;
+class Tensor;
 }  // namespace caffe2

 namespace c10 {
@ -18,26 +17,29 @@ namespace details {
 * If Arg is a Tensor or reference to a Tensor, provide the member constant value equal to true.  Otherwise
 * return false.
 */
-template<class Arg> using is_tensor_arg = guts::is_instantiation_of<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
+template <class Arg>
+using is_tensor_arg = std::
+    is_same<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
+
+inline DeviceTypeId to_device_type_id(caffe2::DeviceType device_type) {
+  switch (device_type) {
+    case caffe2::CPU:
+      return DeviceTypeId::CPU;
+    case caffe2::CUDA:
+      return DeviceTypeId::CUDA;
+    default:
+      return DeviceTypeId::UNDEFINED;
+  }
+}

 // TODO get rid of tensor_to_dispatch_key once c2::Tensor is de-templatized. This then fits into a template lambda instead of a functor.
-template<class TensorType, class Enable = void> struct tensor_to_dispatch_key_ final {};
-template<class TensorType>
-struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CPUContext>>::value>> final {
-    static TensorParameterDispatchKey call(const TensorType& tensor) {
-      return TensorParameterDispatchKey{DeviceTypeId::CPU, LayoutId(0), tensor.meta().id()};
-    }
-};
-template<class TensorType>
-struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CUDAContext>>::value>> final {
-    static TensorParameterDispatchKey call(const TensorType& tensor) {
-      return TensorParameterDispatchKey{DeviceTypeId::CUDA, LayoutId(0), tensor.meta().id()};
-    }
-};
 struct tensor_to_dispatch_key final {
    template<class TensorType>
    TensorParameterDispatchKey operator()(const TensorType& tensor) const {
-      return tensor_to_dispatch_key_<TensorType, void>::call(tensor);
+      return TensorParameterDispatchKey{
+          to_device_type_id(tensor.GetDeviceType()),
+          LayoutId(0),
+          tensor.meta().id()};
    }
 };

--- a/caffe2/core/dispatch/OpSchema_test.cpp
+++ b/caffe2/core/dispatch/OpSchema_test.cpp
@ -4,16 +4,13 @@
 using namespace c10;
 using namespace caffe2;

-static_assert(details::is_tensor_arg<Tensor<CPUContext>>::value, "");
-static_assert(details::is_tensor_arg<const Tensor<CPUContext> &>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CPUContext> &&>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CUDAContext>>::value, "");
-static_assert(details::is_tensor_arg<const Tensor<CUDAContext> &>::value, "");
-static_assert(details::is_tensor_arg<Tensor<CUDAContext> &&>::value, "");
+static_assert(details::is_tensor_arg<Tensor>::value, "");
+static_assert(details::is_tensor_arg<const Tensor&>::value, "");
+static_assert(details::is_tensor_arg<Tensor&&>::value, "");
 static_assert(!details::is_tensor_arg<int>::value, "");

 struct SchemaDef final {
-  using Signature = bool (int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int);
+  using Signature = bool(int, Tensor, float, Tensor, Tensor, unsigned int);
  static constexpr guts::array<const char*, 6> parameter_names = {{
      "1", "2", "3", "4", "5", "6"
  }};
@ -21,4 +18,9 @@ struct SchemaDef final {
 static_assert(6 == OpSchema<SchemaDef>::signature::num_args, "test num_dispatch_args");
 static_assert(3 == OpSchema<SchemaDef>::signature::num_tensor_args, "test num_dispatch_args");
 static_assert(std::is_same<bool, typename OpSchema<SchemaDef>::signature::return_type>::value, "test num_dispatch_args");
-static_assert(std::is_same<guts::typelist::typelist<int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
+static_assert(
+    std::is_same<
+        guts::typelist::
+            typelist<int, Tensor, float, Tensor, Tensor, unsigned int>,
+        typename OpSchema<SchemaDef>::signature::parameter_types>::value,
+    "test num_dispatch_args");
--- a/caffe2/core/hip/blob_serialization_hip.cc
+++ b/caffe2/core/hip/blob_serialization_hip.cc
@ -4,17 +4,7 @@

 namespace caffe2 {

-template <>
-void TensorSerializer<HIPContext>::StoreDeviceDetail(const Tensor<HIPContext>& input,
-                                                     TensorProto* proto)
-{
-    auto* device_detail = proto->mutable_device_detail();
-    device_detail->set_device_type(HIP);
-    device_detail->set_hip_gpu_id(GetGPUIDForPointer(input.raw_data()));
-}
-
 namespace {
-REGISTER_BLOB_SERIALIZER((TypeMeta::Id<TensorHIP>()), TensorSerializer<HIPContext>);
-REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer<HIPContext>);
+REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer);
 }
 } // namespace caffe2
--- a/caffe2/core/hip/context_hip.cc
+++ b/caffe2/core/hip/context_hip.cc
@ -50,8 +50,6 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb,

 namespace caffe2 {

-CAFFE_KNOWN_TYPE(Tensor<HIPContext>);
-
 thread_local ThreadLocalHIPObjects HIPContext::hip_objects_;

 // TODO(jiayq): these variables shouldn't be currently accessed during static
@ -88,16 +86,6 @@ static long g_last_rep  = 0;

 HipMemoryPoolType GetHipMemoryPoolType() { return g_hip_memory_pool_type; }

-vector<TIndex>
-GetHipTensorInfo(const void* c, bool* shares_data, size_t* capacity, DeviceOption* device)
-{
-    vector<TIndex> dims          = GetTensorInfo<HIPContext>(c, shares_data, capacity, device);
-    const Tensor<HIPContext>* tc = static_cast<const Tensor<HIPContext>*>(c);
-    device->set_device_type(HIP);
-    device->set_hip_gpu_id(GetGPUIDForPointer(tc->raw_data()));
-    return dims;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // A wrapper to allow us to lazily initialize all HIP environments that Caffe
 // uses. This gets done the first time a caffe2::HIPContext::New() gets called
@ -151,10 +139,6 @@ static void Caffe2InitializeHip()
        }
    }

-    RegisterTypeCallFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetTensorType<HIPContext>);
-
-    RegisterTensorInfoFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetHipTensorInfo);
-
    // CheckMiOpenVersions();
 }

@ -327,20 +311,17 @@ void TrackMemoryAlloc(size_t nbytes)
 }
 }

-std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
-{
+std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
  // Lock the mutex
  std::lock_guard<std::mutex> lock(HIPContext::mutex());
  // A one-time caffe2 cuda initializer.
  static Caffe2HipInitializerHelper g_hip_initializer_;
  void* ptr = nullptr;

-    if(FLAGS_caffe2_gpu_memory_tracking)
-    {
+  if (FLAGS_caffe2_gpu_memory_tracking) {
    TrackMemoryAlloc(nbytes);
  }
-    switch(g_hip_memory_pool_type)
-    {
+  switch (g_hip_memory_pool_type) {
    case HipMemoryPoolType::NONE:
        HIP_ENFORCE(hipMalloc(&ptr, nbytes));
        if(FLAGS_caffe2_gpu_memory_tracking)
@ -362,13 +343,11 @@ std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
    return {nullptr, Delete};
 }

-void HIPContext::Delete(void* ptr)
-{
+void HIPStaticContext::Delete(void* ptr) {
  // lock the mutex
  std::lock_guard<std::mutex> lock(HIPContext::mutex());

-    if(FLAGS_caffe2_gpu_memory_tracking)
-    {
+  if (FLAGS_caffe2_gpu_memory_tracking) {
    auto sz_it = g_size_map.find(ptr);
    DCHECK(sz_it != g_size_map.end());
    auto aff_it = g_hip_device_affiliation.find(ptr);
@ -378,8 +357,7 @@ void HIPContext::Delete(void* ptr)
    g_size_map.erase(sz_it);
  }

-    switch(g_hip_memory_pool_type)
-    {
+  switch (g_hip_memory_pool_type) {
    case HipMemoryPoolType::NONE:
    {
        // If memory pool is not set up, use simple hipFree.
@ -415,4 +393,11 @@ void HIPContext::Delete(void* ptr)
    }
 }

+BaseStaticContext* GetHIPStaticContext() {
+  static HIPStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(HIP, GetHIPStaticContext());
+
 } // namespace caffe2
--- a/caffe2/core/hip/context_hip.h
+++ b/caffe2/core/hip/context_hip.h
@ -119,37 +119,46 @@ class ThreadLocalHIPObjects {
  vector<miopenHandle_t> miopen_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
 };

-class HIPContext final {
+BaseStaticContext* GetHIPStaticContext();
+
+class HIPContext final : public BaseContext {
 public:
  // The default HIP context constructor.
  explicit HIPContext(const int gpu_id = -1);
  explicit HIPContext(const DeviceOption& option);

-  ~HIPContext() {
+  ~HIPContext() override {
    if (hiprand_generator_) {
      HIPRAND_CHECK(hiprandDestroyGenerator(hiprand_generator_));
    }
    FinishDeviceComputation();
  }

-  inline void SwitchToDevice(int stream_id) {
+  BaseStaticContext* GetStaticContext() const override {
+    return GetHIPStaticContext();
+  }
+
+  static BaseStaticContext* StaticContext() {
+    return GetHIPStaticContext();
+  }
+
+  inline void SwitchToDevice(int stream_id) override {
    set_stream_id(stream_id);
    CaffeHipSetDevice(gpu_id_);
  }
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }

-  inline void WaitEvent(const Event& ev) {
+  using BaseContext::SwitchToDevice;
+
+  inline void WaitEvent(const Event& ev) override {
    ev.Wait(HIP, this);
  }

-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(HIP, this, err_msg);
  }

-  void FinishDeviceComputation() {
+  void FinishDeviceComputation() override {
    hipStreamSynchronize(hip_objects_.GetStream(gpu_id_, stream_id_));
    hipError_t error = hipGetLastError();
    if (error != hipSuccess) {
@ -194,7 +203,9 @@ class HIPContext final {
    return hiprand_generator_;
  }

-  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
+  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+    return StaticContext()->New(nbytes);
+  }

  // Get a mutex to lock out hipMalloc / hipFree calls when
  // NCCL kernels are being launched. Should remove threat of
@ -218,6 +229,18 @@ class HIPContext final {
        hip_objects_.GetStream(gpu_id_, stream_id_)));
  }

+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<HIPContext, HIPContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
+  }
+
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
@ -245,6 +268,14 @@ class HIPContext final {
    return hipStreamQuery(stream) == hipSuccess;
  }

+  DeviceType GetDevicetype() const override {
+    return HIP;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return HIP;
+  }
+
 protected:
  static void Delete(void* data);
  void set_stream_id(int stream_id) {
@ -338,8 +369,37 @@ struct PinnedCPUAllocator final : CPUAllocator {
  DefaultCPUAllocator baseAllocator_;
 };

-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<HIPContext> TensorHIP;
+class HIPStaticContext final : public BaseStaticContext {
+ public:
+  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<HIPContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<HIPContext>(option);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
+    return caffe2::make_unique<HIPContext>(gpu_id);
+  }
+
+  DeviceType GetDeviceType() override {
+    return HIP;
+  }
+
+  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
+    device->set_device_type(GetDeviceType());
+    device->set_hip_gpu_id(GetGPUIDForPointer(data));
+  }
+
+ protected:
+  static void Delete(void* data);
+};
+
+typedef Tensor TensorHIP;

 } // namespace caffe2

--- a/caffe2/core/int8_serialization.cc
+++ b/caffe2/core/int8_serialization.cc
@ -56,7 +56,7 @@ class Int8TensorCPUSerializer : public BlobSerializerBase {
  CPUContext context_;
 };

-class Int8TensorCPUDeserializer : public TensorDeserializer<CPUContext> {
+class Int8TensorCPUDeserializer : public TensorDeserializer {
 public:
  void Deserialize(const BlobProto& blob_proto, Blob* blob) override {
    const QTensorProto& proto = blob_proto.qtensor();
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@ -79,11 +79,45 @@ class OperatorBase : public Observable<OperatorBase> {
    }
  }

+  // TODO(jerryzh): Remove template
+  // and the type argument?
+  // This is to keep the API changes minimal and make refactoring
+  // a bit easier
+  template <typename T>
+  inline const T& Input(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "Input(int, DeviceType) is only available for Tensor");
+    DCHECK_LT(idx, inputs_.size());
+    try {
+      // TODO(jerryzh): We'll need to check device type in Get<T>() later
+      // Get<T>() -> Get<T>(type)
+      const auto& tensor = inputs_.at(idx)->template Get<T>();
+      return tensor;
+    } catch (::caffe2::EnforceNotMet& enf) {
+      if (has_debug_def()) {
+        enf.AppendMessage(".\nOffending Blob name: ");
+        enf.AppendMessage(debug_def().input(idx));
+        enf.AppendMessage(".\n");
+      }
+      throw enf;
+    }
+  }
+
  template <typename T>
  inline T* Output(int idx) {
    return outputs_.at(idx)->template GetMutable<T>();
  }

+  // TODO(jerryzh): Remove this template
+  template <typename T>
+  inline T* Output(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "Output(int, DeviceType) is only available for Tensor");
+    return outputs_.at(idx)->GetMutableTensor(type);
+  }
+
  template <typename T>
  inline T* Output(int idx, T* allocated) {
    outputs_.at(idx)->Reset(allocated);
@ -103,11 +137,29 @@ class OperatorBase : public Observable<OperatorBase> {
    return inputs_.at(idx)->template IsType<T>();
  }

+  template <typename T>
+  inline bool InputIsType(int idx, DeviceType device_type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "InputIsType(idx, DeviceType) only available on "
+        "Tensor types.");
+    return inputs_.at(idx)->template IsType<T>(device_type);
+  }
+
  template <typename T>
  inline bool OutputIsType(int idx) {
    return outputs_.at(idx)->template IsType<T>();
  }

+  template <typename T>
+  inline bool OutputIsType(int idx, DeviceType type) {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "OutputIsType(idx, DeviceType) only available on "
+        "Tensor types.");
+    return outputs_.at(idx)->template IsType<T>(type);
+  }
+
  inline int InputSize() const {
    return inputs_.size();
  }
@ -380,11 +432,14 @@ class Operator : public OperatorBase {
  }
  ~Operator() noexcept override {}

-  inline const Tensor<Context>& Input(int idx) {
-    return OperatorBase::template Input<Tensor<Context>>(idx);
+  inline const Tensor& Input(
+      int idx,
+      DeviceType type = Context::GetDeviceType()) {
+    return OperatorBase::template Input<Tensor>(idx, type);
  }
-  inline Tensor<Context>* Output(int idx) {
-    return OperatorBase::template Output<Tensor<Context>>(idx);
+
+  inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
+    return OperatorBase::template Output<Tensor>(idx, type);
  }

  void WaitEvent(const Event& ev, int stream_id = -1) final {
@ -714,8 +769,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
      return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>::             \
          template call<Op>(op, meta);                                         \
    }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
      return call<Op>(op, tensor.meta());                                      \
    }                                                                          \
    template <typename Op>                                                     \
@ -730,8 +785,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
    static bool call(Op* /* unused */, const TypeMeta& meta) {                 \
      CAFFE_THROW("Unsupported type of tensor: ", meta.name());                \
    }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
      return call<Op>(op, tensor.meta());                                      \
    }                                                                          \
    template <typename Op>                                                     \
@ -748,8 +803,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
    static bool call(Op* op, const TypeMeta&) {                                \
      return op->template DoRunWithOtherType<ExtraArgs...>();                  \
    }                                                                          \
-    template <typename Op, typename Context>                                   \
-    static bool call(Op* op, const Tensor<Context>& tensor) {                  \
+    template <typename Op>                                                     \
+    static bool call(Op* op, const Tensor& tensor) {                           \
      return call<Op>(op, tensor.meta());                                      \
    }                                                                          \
    template <typename Op>                                                     \
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@ -131,8 +131,7 @@ struct WorkspaceIdInjector {
          "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
      int32_t global_ws_id = (seq_++) + (static_cast<int32_t>(node_id) << 16);
      Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
-      TensorCPU* global_ws_id_tensor =
-          global_ws_id_blob->template GetMutable<TensorCPU>();
+      TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU);
      global_ws_id_tensor->Resize();
      global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
      VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@ -43,9 +43,33 @@ TensorPrinter::~TensorPrinter() {
  }
 }

+void TensorPrinter::PrintMeta(const Tensor& tensor) {
+  if (to_file_) {
+    (*log_file_) << MetaStr(tensor) << std::endl;
+  } else {
+    LOG(INFO) << MetaStr(tensor);
+  }
+}
+
+std::string TensorPrinter::MetaStr(const Tensor& tensor) {
+  std::stringstream meta_stream;
+  meta_stream << "Tensor " << tensor_name_ << " of type "
+              << tensor.meta().name() << ". Dims: (";
+  for (const auto dim : tensor.dims()) {
+    meta_stream << dim << ",";
+  }
+  meta_stream << "): ";
+  return meta_stream.str();
+}
+
+TypeMeta GetTensorType(const void* c) {
+  const Tensor* tc = static_cast<const Tensor*>(c);
+  return tc->meta();
+}
+
+// TODO(jerryzh): Remove
 static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_{
-  {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorType<CPUContext>}
-};
+    {TypeMeta::Id<Tensor>(), GetTensorType}};

 TypeCall GetTypeCallFunction(CaffeTypeId id) {
  auto f = type_call_registry_.find(id);
@ -59,9 +83,26 @@ void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c) {
  type_call_registry_[id] = c;
 }

-static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
-    {TypeMeta::Id<Tensor<CPUContext>>(), GetTensorInfo<CPUContext>}};
+int GetGPUIDForPointer(const void* ptr);

+vector<TIndex> GetTensorInfo(
+    const void* c,
+    bool* shares_data,
+    size_t* capacity,
+    DeviceOption* device) {
+  const Tensor* tc = static_cast<const Tensor*>(c);
+  *shares_data = tc->shares_data();
+  *capacity = tc->capacity_nbytes();
+  tc->ExtractDeviceOption(device);
+  return tc->dims();
+}
+
+// since we only have one tensor, probably need to remove this at some point?
+static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
+    {TypeMeta::Id<Tensor>(), GetTensorInfo}};
+
+// TODO: Remove this code in a separate diff, since we only have one
+// GetTensorInfo function now
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id) {
  auto f = tensor_info_call_registry_.find(id);
  if (f == tensor_info_call_registry_.end()) {
@ -74,11 +115,21 @@ void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c) {
  tensor_info_call_registry_[id] = c;
 }

+void TensorVectorResize(
+    std::vector<Tensor>& tensors,
+    int size,
+    DeviceType type) {
+  tensors.reserve(size);
+  for (auto i = 0; i < size; ++i) {
+    tensors.emplace_back(type);
+  }
+}
+
 namespace {

-struct TensorCPUStatGetter : BlobStatGetter {
+struct TensorStatGetter : BlobStatGetter {
  size_t sizeBytes(const Blob& blob) const override {
-    const auto& tensor = blob.Get<TensorCPU>();
+    const auto& tensor = blob.Get<Tensor>();
    auto nbytes = tensor.nbytes();
    if (nbytes > 0 && tensor.IsType<std::string>()) {
      const auto* data = tensor.data<std::string>();
@ -89,7 +140,7 @@ struct TensorCPUStatGetter : BlobStatGetter {
    return nbytes;
  }
 };
-REGISTER_BLOB_STAT_GETTER(TensorCPU, TensorCPUStatGetter);
+REGISTER_BLOB_STAT_GETTER(Tensor, TensorStatGetter);
 }

 } // namespace caffe2
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@ -89,13 +89,10 @@ inline int canonical_axis_index_(int axis_index, int ndims) {
 * the allocation and de-allocation of such memory. We make a simplified
 * assumption that the memory is always contiguous.
 */
-template <class Context>
 class Tensor {
 public:
-  /**
-   * Initializes an empty tensor.
-   */
-  Tensor() {}
+  Tensor() = delete;
+  explicit Tensor(DeviceType type) : device_type_(type) {}

  /**
   * @brief Creates a tensor of the given dimension.
@ -103,67 +100,87 @@ class Tensor {
   * Note that the actual data allocation is not going to be carried out until
   * the first time mutable_data() is called.
   */
-  explicit Tensor(const vector<TIndex>& dims) { Resize(dims); }
-  explicit Tensor(const vector<int>& dims) { Resize(dims); }
+  explicit Tensor(const vector<TIndex>& dims, DeviceType type)
+      : device_type_(type) {
+    Resize(dims);
+  }
+  explicit Tensor(const vector<int>& dims, DeviceType type)
+      : device_type_(type) {
+    Resize(dims);
+  }

-  /**
-   * @brief Creates a tensor from a source tensor, copying over the content.
-   *
-   * Note that the source tensor can be from a different device context. The
-   * second argument provides a device context object (either Context or
-   * SrcContext) that will be responsible for copying the underlying data.
-   * If you do not wish to pass in a Context object, an equivalent constructor
-   * function exists that will create an implicit context object for copy, but
-   * be noted that this will cause a potential performance hit.
+  /* Now we require that context_for_copy has the same device type as src since
+   * template is removed
   */
-  template <class SrcContext, class ContextForCopy>
-  Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) {
-    CopyFrom(src, context);
+  Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type)
+      : device_type_(type) {
+    CopyFrom(src, context_for_copy);
  }

  /**
-   * @brief Creates a tensor from a source tensor, copying over the content.
-   *
-   * Note that this may have a potential performance hit, since a temporary
-   * context object will be created for the memory copy. Prefer explicitly
-   * providing a context for copy if you can.
-   *
-   * Since it's a potentially expensive operation - making copy constructor
-   * explicit here. If SrcContext != Context it's actually a typecast
-   * constructor and it should be definitely explicit.
+   * @brief: Create a Tensor of DeviceType `type` and initialize it with
+   * src Tensor
   */
-  template <class SrcContext>
-  explicit Tensor(const Tensor<SrcContext>& src) {
+  Tensor(const Tensor& src, DeviceType type) : device_type_(type) {
    CopyFrom(src);
  }

  /**
   * @brief Creates a tensor, and fills its contents with the given values.
+   * The type of tensor will be decided by the context parameter
   */
  template <typename T>
-  Tensor(const vector<TIndex>& dims, const vector<T>& values, Context* context)
+  Tensor(
+      const vector<TIndex>& dims,
+      const vector<T>& values,
+      BaseContext* context)
      : meta_(TypeMeta::Make<T>()) {
    Resize(dims);
    CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size_);
-    context->template Copy<T, CPUContext, Context>(size_, values.data(), mutable_data<T>());
+    device_type_ = context->GetDevicetype();
+    context->CopyItemsFromCPU(meta_, size_, values.data(), mutable_data<T>());
  }

  /**
   * @brief Creates a scalar tensor, and fills its content with the given value.
+   * The type of tensor will be decided by the context parameter
   */
-  template <typename T,
+  template <
+      typename T,
      typename = typename std::enable_if<std::is_scalar<T>::value>::type>
-  Tensor(const T& value, Context* context) {
+  Tensor(const T& value, BaseContext* context) : meta_(TypeMeta::Make<T>()) {
    Resize(vector<TIndex>{});
-    context->template Copy<T, CPUContext, Context>(size_, &value, mutable_data<T>());
+    device_type_ = context->GetDevicetype();
+    context->CopyItemsFromCPU(meta_, size_, &value, mutable_data<T>());
  }

+  /*
+   * Since we removed template from tensor, we now store a static
+   * context pointer in tensor, which indicates the type of the tensor.
+   */
+  BaseStaticContext* GetStaticContext() const {
+    return GET_STATIC_CONTEXT(device_type_);
+  }
+
+  /* @brief
+   * Create a context that has the same device_type
+   * as the tensor.
+   * Note that this doesn't support passing in argument
+   * TODO(jerryzh): move this to a global registry
+   * that can create context for us
+   */
+  std::unique_ptr<BaseContext> CreateContext() const {
+    return GetStaticContext()->CreateContext();
+  }
+
+  DeviceType GetDeviceType() const {
+    return device_type_;
+  }
  /**
   * @brief Copies the data from a source tensor, with a contex provided to
   * carry out the underlying memcpy operation.
   */
-  template <class SrcContext, class ContextForCopy>
-  void CopyFrom(const Tensor<SrcContext>& src, ContextForCopy* context) {
+  void CopyFrom(const Tensor& src, BaseContext* context = nullptr) {
    if ((void*)&src == (void*)this) {
      return;
    }
@ -180,25 +197,37 @@ class Tensor {
    Resize(src.dims());
    if (size() > 0) {
      if (meta_.copy()) {
+        CAFFE_ENFORCE(
+            GetDeviceType() == CPU,
+            "In CopyFrom source and dest tensors must both be CPU for meta copy");
+        CAFFE_ENFORCE(
+            src.GetDeviceType() == CPU,
+            "In CopyFrom source and dest tensors must both be CPU for meta copy");
        meta_.copy()(src.raw_data(), raw_mutable_data(), size());
      } else {
-        context->template CopyBytes<SrcContext, Context>(
+        // We'll need to use a non-CPU context to perform the copy if
+        // one of the context is not CPU since only non-CPU context
+        // knows how to copy between CPU and that context
+        if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) {
+          if (!context) {
+            src.CreateContext().get()->CopyBytesToDevice(
+                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
+          } else {
+            CAFFE_ENFORCE(
+                context->GetDevicetype() == src.GetDeviceType(),
+                "Type for provided context does not match the type of source");
+            context->CopyBytesToDevice(
+                nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
+          }
+        } else {
+          // In case source context is CPU, and target context is non-CPU
+          // We'll have to create a Context from target and perform the
+          // copy using that context
+          CreateContext().get()->CopyBytesFromCPU(
              nbytes(), src.raw_data(), raw_mutable_data());
        }
      }
    }
-
-  /**
-   * @brief Copies the data from a source tensor.
-   *
-   * Note that this may have a potential performance hit, since a temporary
-   * context object will be created for the memory copy. Prefer explicitly
-   * providing a context for copy if you can.
-   */
-  template <class SrcContext>
-  inline void CopyFrom(const Tensor<SrcContext>& src) {
-    SrcContext tmp_context;
-    CopyFrom(src, &tmp_context);
  }

  virtual ~Tensor() noexcept {}
@ -212,8 +241,7 @@ class Tensor {
   * growthPct. This ensures that Extend runs on an amortized O(1) time
   * complexity.
   */
-  template <class ContextForCopy>
-  void Extend(TIndex num, float growthPct, ContextForCopy* context) {
+  void Extend(TIndex num, float growthPct, BaseContext* context) {
    CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
    auto newDims = dims_;
    newDims[0] += num;
@ -239,8 +267,8 @@ class Tensor {
    size_ = newSize;
  }

-  template <class T, class ContextForCopy>
-  void Reserve(const std::vector<T>& newCapacity, ContextForCopy* context) {
+  template <class T>
+  void Reserve(const std::vector<T>& newCapacity, BaseContext* context) {
    auto newSize = std::accumulate(
        newCapacity.begin(),
        newCapacity.end(),
@ -254,8 +282,7 @@ class Tensor {
    auto oldDims = dims_;
    Resize(newCapacity);
    auto* newData = raw_mutable_data(meta_);
-    context->template CopyItems<ContextForCopy, ContextForCopy>(
-        meta_, oldSize, oldData.get(), newData);
+    context->CopyItemsSameDevice(meta_, oldSize, oldData.get(), newData);
    dims_ = oldDims;
    size_ = oldSize;
    reserved_ = true;
@ -320,8 +347,7 @@ class Tensor {
   * Resize the tensor like the source tensor. Note that this is just a
   * sugar wrapper that essentially calls Resize(src_tensor.dims()).
   */
-  template <class OtherContext>
-  inline void ResizeLike(const Tensor<OtherContext>& src_tensor) {
+  inline void ResizeLike(const Tensor& src_tensor) {
    // Note: need casting for different context types.
    if (static_cast<void*>(this) != static_cast<const void*>(&src_tensor)) {
      Resize(src_tensor.dims());
@ -384,7 +410,7 @@ class Tensor {
    return ss.str();
  }

-  void swap(Tensor<Context>& other) {
+  void swap(Tensor& other) noexcept {
    std::swap(dims_, other.dims_);
    std::swap(size_, other.size_);
    std::swap(meta_, other.meta_);
@ -392,6 +418,7 @@ class Tensor {
    std::swap(shares_data_, other.shares_data_);
    std::swap(capacity_, other.capacity_);
    std::swap(reserved_, other.reserved_);
+    std::swap(device_type_, other.device_type_);
  }

  /**
@ -542,7 +569,8 @@ class Tensor {
        // destruction procedure.
        auto size = size_;
        auto dtor = meta_.dtor();
-        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
+        auto ptr_and_deleter =
+            GetStaticContext()->New(size_ * meta_.itemsize());
        auto deleter = ptr_and_deleter.second;
        data_.reset(
            ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void {
@ -552,7 +580,8 @@ class Tensor {
        meta_.ctor()(data_.get(), size_);
      } else {
        // For fundamental type, new and delete is easier.
-        auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
+        auto ptr_and_deleter =
+            GetStaticContext()->New(size_ * meta_.itemsize());
        data_.reset(ptr_and_deleter.first, ptr_and_deleter.second);
      }
      capacity_ = size_ * meta_.itemsize();
@ -690,20 +719,28 @@ class Tensor {
    return dims_[i];
  }

+  // We don't allow change to the type of
+  // tensor after initialization
  Tensor Clone() const {
-    Tensor x;
+    Tensor x(GetDeviceType());
    x.CopyFrom(*this);
    return x;
  }

-  Tensor(Tensor<Context>&& src) noexcept {
+  Tensor(Tensor&& src) noexcept {
    swap(src);
  }

+  Tensor& operator=(Tensor&&) = default;
+
  /**
   * @brief Delete the copy constructor and use Clone explicitly
   */
-  Tensor(const Tensor<Context>& src) = delete;
+  Tensor(const Tensor& src) = delete;
+
+  void ExtractDeviceOption(DeviceOption* device) const {
+    GetStaticContext()->ExtractDeviceOption(device, raw_data());
+  }

 protected:
  vector<TIndex> dims_;
@ -713,6 +750,7 @@ class Tensor {
  bool shares_data_ = false;
  size_t capacity_ = 0;
  bool reserved_ = false;
+  DeviceType device_type_ = CPU;
  // In case of chunk load we store how much data was already loaded

 private:
@ -785,8 +823,7 @@ class Tensor {
  Tensor& operator=(const Tensor& src) = delete;
 };

-// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
-typedef Tensor<CPUContext> TensorCPU;
+using TensorCPU = Tensor;

 constexpr int k_limit_default_ = 1000;

@ -795,12 +832,6 @@ typedef TypeMeta (*TypeCall)(const void*);
 TypeCall GetTypeCallFunction(CaffeTypeId id);
 void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c);

-template <class Context>
-TypeMeta GetTensorType(const void* c) {
-  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
-  return tc->meta();
-}
-
 // Shape call registry
 typedef vector<TIndex> (*TensorInfoCall)(
    const void*,
@ -810,19 +841,11 @@ typedef vector<TIndex> (*TensorInfoCall)(
 TensorInfoCall GetTensorInfoFunction(CaffeTypeId id);
 void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c);

-template <class Context>
-vector<TIndex> GetTensorInfo(
-    const void* c,
-    bool* shares_data,
-    size_t* capacity,
-    DeviceOption* device) {
-  const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
-  *shares_data = tc->shares_data();
-  *capacity = tc->capacity_nbytes();
-  device->set_device_type(CPU);
-  device->set_cuda_gpu_id(0);
-  return tc->dims();
-}
+// resize helper function
+void TensorVectorResize(
+    std::vector<Tensor>& tensors,
+    int size,
+    DeviceType type);

 class TensorPrinter {
 public:
@ -833,13 +856,11 @@ class TensorPrinter {
  ~TensorPrinter();

  template <class T>
-  void Print(const Tensor<CPUContext>& tensor);
+  void Print(const Tensor& tensor);

-  template <class Context>
-  void PrintMeta(const Tensor<Context>& tensor);
+  void PrintMeta(const Tensor& tensor);

-  template <class Context>
-  string MetaStr(const Tensor<Context>& tensor);
+  string MetaStr(const Tensor& tensor);

 private:
  bool to_file_;
@ -849,7 +870,7 @@ class TensorPrinter {
 };

 template <class T>
-void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
+void TensorPrinter::Print(const Tensor& tensor) {
  std::stringstream values_stream;
  // One most likely doesn't want to print int64-number of items for visual
  // inspection, so we cast down to int here.
@ -869,26 +890,5 @@ void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
  }
 }

-template <class Context>
-void TensorPrinter::PrintMeta(const Tensor<Context>& tensor) {
-  if (to_file_) {
-    (*log_file_) << MetaStr(tensor) << std::endl;
-  } else {
-    LOG(INFO) << MetaStr(tensor);
-  }
-}
-
-template <class Context>
-std::string TensorPrinter::MetaStr(const Tensor<Context>& tensor) {
-  std::stringstream meta_stream;
-  meta_stream << "Tensor " << tensor_name_ << " of type "
-              << tensor.meta().name() << ". Dims: (";
-  for (const auto dim : tensor.dims()) {
-    meta_stream << dim << ",";
-  }
-  meta_stream << "): ";
-  return meta_stream.str();
-}
-
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_TENSOR_H_
--- a/caffe2/core/tensor_int8.h
+++ b/caffe2/core/tensor_int8.h
@ -3,6 +3,7 @@

 #include "caffe2/core/context.h"
 #include "caffe2/core/tensor.h"
+#include "caffe2/proto/caffe2.pb.h"

 namespace caffe2 {
 namespace int8 {
@ -12,7 +13,7 @@ struct Int8TensorCPU {
  int32_t zero_point{0};
  // Generally stores uint8_t data, but sometimes int32_t (e.g. bias
  // parameters).
-  TensorCPU t;
+  Tensor t{CPU};
 };
 } // namespace int8
 } // namespace caffe2
--- a/caffe2/core/typeid.cc
+++ b/caffe2/core/typeid.cc
@ -69,8 +69,7 @@ CaffeTypeId CaffeTypeId::createTypeId() {
  return CaffeTypeId(new_value);
 }

-CAFFE_DEFINE_KNOWN_TYPE(Tensor<CPUContext>);
-CAFFE_DEFINE_KNOWN_TYPE(Tensor<CUDAContext>);
+CAFFE_DEFINE_KNOWN_TYPE(Tensor);
 CAFFE_DEFINE_KNOWN_TYPE(float);
 CAFFE_DEFINE_KNOWN_TYPE(int);
 CAFFE_DEFINE_KNOWN_TYPE(std::string);
--- a/caffe2/core/typeid.h
+++ b/caffe2/core/typeid.h
@ -437,41 +437,37 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
      #T);                                                     \
  }

-template <class Context>
 class Tensor;
-class CPUContext;
-class CUDAContext;

 // note: first preallocated id is 1, because 0 is used for uninitialized type
 // ids.
 struct _CaffeHighestPreallocatedTypeId final {};

-CAFFE_DECLARE_KNOWN_TYPE(1, Tensor<CPUContext>);
-CAFFE_DECLARE_KNOWN_TYPE(2, Tensor<CUDAContext>);
-CAFFE_DECLARE_KNOWN_TYPE(3, float);
-CAFFE_DECLARE_KNOWN_TYPE(4, int);
-CAFFE_DECLARE_KNOWN_TYPE(5, std::string);
-CAFFE_DECLARE_KNOWN_TYPE(6, bool);
-CAFFE_DECLARE_KNOWN_TYPE(7, uint8_t);
-CAFFE_DECLARE_KNOWN_TYPE(8, int8_t);
-CAFFE_DECLARE_KNOWN_TYPE(9, uint16_t);
-CAFFE_DECLARE_KNOWN_TYPE(10, int16_t);
-CAFFE_DECLARE_KNOWN_TYPE(11, int64_t);
-CAFFE_DECLARE_KNOWN_TYPE(12, double);
-CAFFE_DECLARE_KNOWN_TYPE(13, char);
-CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::mutex>);
-CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr<std::atomic<bool>>);
-CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int32_t>);
-CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<int64_t>);
-CAFFE_DECLARE_KNOWN_TYPE(18, std::vector<unsigned long>);
-CAFFE_DECLARE_KNOWN_TYPE(19, bool*);
-CAFFE_DECLARE_KNOWN_TYPE(20, char*);
-CAFFE_DECLARE_KNOWN_TYPE(21, int*);
+CAFFE_DECLARE_KNOWN_TYPE(1, Tensor);
+CAFFE_DECLARE_KNOWN_TYPE(2, float);
+CAFFE_DECLARE_KNOWN_TYPE(3, int);
+CAFFE_DECLARE_KNOWN_TYPE(4, std::string);
+CAFFE_DECLARE_KNOWN_TYPE(5, bool);
+CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t);
+CAFFE_DECLARE_KNOWN_TYPE(7, int8_t);
+CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t);
+CAFFE_DECLARE_KNOWN_TYPE(9, int16_t);
+CAFFE_DECLARE_KNOWN_TYPE(10, int64_t);
+CAFFE_DECLARE_KNOWN_TYPE(11, double);
+CAFFE_DECLARE_KNOWN_TYPE(12, char);
+CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr<std::mutex>);
+CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::atomic<bool>>);
+CAFFE_DECLARE_KNOWN_TYPE(15, std::vector<int32_t>);
+CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int64_t>);
+CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<unsigned long>);
+CAFFE_DECLARE_KNOWN_TYPE(18, bool*);
+CAFFE_DECLARE_KNOWN_TYPE(19, char*);
+CAFFE_DECLARE_KNOWN_TYPE(20, int*);

 #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
-CAFFE_DECLARE_KNOWN_TYPE(22, long);
-CAFFE_DECLARE_KNOWN_TYPE(23, std::vector<long>);
+CAFFE_DECLARE_KNOWN_TYPE(21, long);
+CAFFE_DECLARE_KNOWN_TYPE(22, std::vector<long>);
 #endif // CAFFE2_UNIQUE_LONG_TYPEMETA

-CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId);
+CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId);
 }
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@ -136,14 +136,14 @@ class Workspace {
      auto* from_blob = parent_ws->GetBlob(ws_blob.second);
      CAFFE_ENFORCE(from_blob);
      CAFFE_ENFORCE(
-          from_blob->template IsType<Tensor<Context>>(),
+          from_blob->template IsType<Tensor>(),
          "Expected blob with tensor value",
          ws_blob.second);
      forwarded_blobs_.erase(blob);
      auto* to_blob = CreateBlob(blob);
      CAFFE_ENFORCE(to_blob);
-      const auto& from_tensor = from_blob->template Get<Tensor<Context>>();
-      auto* to_tensor = to_blob->template GetMutable<Tensor<Context>>();
+      const auto& from_tensor = from_blob->template Get<Tensor>();
+      auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType());
      to_tensor->CopyFrom(from_tensor);
    }
  }
--- a/caffe2/experiments/operators/fully_connected_op_decomposition.h
+++ b/caffe2/experiments/operators/fully_connected_op_decomposition.h
@ -100,8 +100,8 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
  }

 protected:
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> multi_buffer_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor multi_buffer_{Context::GetDeviceType()};
 };

 template <typename T, class Context, class Engine=DefaultEngine>
@ -207,10 +207,10 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
  }

 protected:
-  Tensor<Context> bias_multiplier_;
-  Tensor<Context> du_buffer_;
-  Tensor<Context> dv_buffer_;
-  Tensor<Context> dx_buffer_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
+  Tensor du_buffer_{Context::GetDeviceType()};
+  Tensor dv_buffer_{Context::GetDeviceType()};
+  Tensor dx_buffer_{Context::GetDeviceType()};
 };

 }  // namespace caffe2
--- a/caffe2/experiments/operators/fully_connected_op_prune.h
+++ b/caffe2/experiments/operators/fully_connected_op_prune.h
@ -189,7 +189,7 @@ namespace caffe2 {
        }

      protected:
-        Tensor<Context> bias_multiplier_;
+       Tensor bias_multiplier_{Context::GetDeviceType()};
    };

  template <typename T, class Context, class Engine=DefaultEngine>
@ -343,9 +343,9 @@ namespace caffe2 {
        }

      protected:
-        Tensor<Context> bias_multiplier_;
-        Tensor<Context> sum_buffer_;
-        Tensor<Context> comp_r_buf_;
+       Tensor bias_multiplier_{Context::GetDeviceType()};
+       Tensor sum_buffer_{Context::GetDeviceType()};
+       Tensor comp_r_buf_{Context::GetDeviceType()};
    };

 }  // namespace caffe2
--- a/caffe2/experiments/operators/fully_connected_op_sparse.h
+++ b/caffe2/experiments/operators/fully_connected_op_sparse.h
@ -140,7 +140,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
  }

 protected:
-  Tensor<Context> bias_multiplier_;
+  Tensor bias_multiplier_{Context::GetDeviceType()};
 };


--- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h
+++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h
@ -104,7 +104,6 @@ class SparseMatrixReshapeOp : public Operator<Context> {
    CAFFE_ENFORCE(
        old_row.size() == nnz,
        "Column and row tensors must have the same size.");
-
    auto* new_col = Output(0);
    auto* new_row = Output(1);
    new_col->Resize(nnz);
--- a/caffe2/ideep/operators/concat_split_op.cc
+++ b/caffe2/ideep/operators/concat_split_op.cc
@ -27,7 +27,7 @@ class IDEEPConcatOp final : public IDEEPOperator {
  bool RunOnDevice() override {
    const auto& input_zero = Input(INPUT0);
    auto* output = Output(OUTPUT);
-    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO);
+    TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO, CPU);

    vector<itensor> inputs;
    for (int i = 0; i < InputSize(); ++i) {
@ -88,7 +88,7 @@ class IDEEPSplitOp final : public IDEEPOperator {
          0,
          "If you set split with an input blob, do not pass in "
          "split in the argument.");
-      auto& axis_info = OperatorBase::Input<TensorCPU>(AXIS_INFO);
+      auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
      CAFFE_ENFORCE_EQ(axis_info.size(), OutputSize());
      auto* axis_data = axis_info.template data<int>();
      axis_vdata.assign(axis_data, axis_data + OutputSize());
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@ -74,7 +74,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
    for (int i = 0; i < InputSize(); ++i) {
      if (InputIsType<itensor>(i) && Input(i).get_data_type() == itensor::data_type::f32) {
        auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
+        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
        dtensor->Resize(input.get_dims());
        if (input.is_public_format()) {
          dtensor->ShareExternalPointer(static_cast<float*>(input.get_data_handle()));
@ -85,7 +85,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
          InputIsType<itensor>(i) &&
          Input(i).get_data_type() == itensor::data_type::s32) {
        auto& input = Input(i);
-        auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
+        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
        dtensor->Resize(input.get_dims());
        if (input.is_public_format()) {
          dtensor->ShareExternalPointer(
@ -138,8 +138,8 @@ class IDEEPFallbackOp final : public IDEEPOperator {
        VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
        auto src_dims = src.dims();
        Blob* dst = OperatorBase::OutputBlob(i);
-        dst->Reset(new Tensor<CPUContext>());
-        auto dtensor = dst->template GetMutable<TensorCPU>();
+        dst->Reset(new Tensor(CPU));
+        auto dtensor = dst->GetMutableTensor(CPU);
        dtensor->Resize(src_dims);
        dtensor->ShareData(src);
      }
@ -156,4 +156,3 @@ class IDEEPFallbackOp final : public IDEEPOperator {
 };

 } // namespace caffe2
-
--- a/caffe2/ideep/operators/utility_ops.cc
+++ b/caffe2/ideep/operators/utility_ops.cc
@ -10,7 +10,7 @@ class CopyCPUToIDEEPOp final : public IDEEPOperator {
  USE_IDEEP_DEF_ALIASES();

  bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<TensorCPU>(0);
+    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
    auto* Y = OperatorBase::OutputBlob(0);
    itensor::dims src_dims(X.dims().begin(), X.dims().end());
    if (!(Y->template IsType<itensor>() &&
@ -31,14 +31,14 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
  USE_IDEEP_DEF_ALIASES();
  bool RunOnDevice() override {
    const auto& input_blob = OperatorBase::InputBlob(0);
-    if (input_blob.template IsType<TensorCPU>()) {
+    if (input_blob.template IsType<Tensor>(CPU)) {
      VLOG(2) << "Directing sharing of TensorCPU";
      const auto& X = OperatorBase::Input<TensorCPU>(0);
-      auto* Y = OperatorBase::Output<TensorCPU>(0);
+      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
      Y->CopyFrom(X);
    } else {
      const auto& X = OperatorBase::Input<itensor>(0);
-      auto* Y = OperatorBase::Output<TensorCPU>(0);
+      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
      Y->Resize(X.get_dims());
      if (X.get_data_type() == itensor::data_type::f32) {
        X.reorder_to(Y->template mutable_data<float>());
--- a/caffe2/ideep/utils/ideep_context.h
+++ b/caffe2/ideep/utils/ideep_context.h
@ -8,7 +8,9 @@

 namespace caffe2 {

-class IDEEPContext final {
+BaseStaticContext* GetIDEEPStaticContext();
+
+class IDEEPContext final : public BaseContext {
 public:
  typedef std::mt19937 rand_gen_type;
  IDEEPContext() : random_seed_(RandomNumberSeed()) {}
@ -21,11 +23,17 @@ class IDEEPContext final {

  ~IDEEPContext() noexcept {}

-  inline void SwitchToDevice(int /*stream_id*/) {}
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
+  BaseStaticContext* GetStaticContext() const override {
+    return GetIDEEPStaticContext();
  }

+  static BaseStaticContext* StaticContext() {
+    return GetIDEEPStaticContext();
+  }
+
+  inline void SwitchToDevice(int /*stream_id*/) {}
+  using BaseContext::SwitchToDevice;
+
  inline void WaitEvent(const Event& ev) {
    ev.Wait(IDEEP, this);
  }
@ -46,7 +54,29 @@ class IDEEPContext final {
  }

  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return GetCPUAllocator()->New(nbytes);
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
+    if (nbytes == 0) {
+      return;
+    }
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // IDEEP meta copy is OK
+    return true;
  }

  // Two copy functions that deals with cross-device copies.
@ -89,6 +119,14 @@ class IDEEPContext final {
    return true;
  }

+  DeviceType GetDevicetype() const override {
+    return IDEEP;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return IDEEP;
+  }
+
 protected:
  // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
  int random_seed_{1701};
@ -133,4 +171,25 @@ inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
  CAFFE_ENFORCE(dst);
  memcpy(dst, src, nbytes);
 }
+
+class IDEEPStaticContext : public BaseStaticContext {
+ public:
+  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    return GetCPUAllocator()->New(nbytes);
+  }
+
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<IDEEPContext>();
+  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<IDEEPContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return IDEEP;
+  }
+};
+
 } // namespace caffe2
--- a/caffe2/ideep/utils/ideep_register.cc
+++ b/caffe2/ideep/utils/ideep_register.cc
@ -1,7 +1,8 @@
-#include <ideep_pin_singletons.hpp>
+#include <caffe2/core/event_cpu.h>
 #include <caffe2/core/operator.h>
 #include <caffe2/proto/caffe2.pb.h>
-#include <caffe2/core/event_cpu.h>
+#include <ideep_pin_singletons.hpp>
+#include "ideep_context.h"

 namespace caffe2 {

@ -26,4 +27,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);

+BaseStaticContext* GetIDEEPStaticContext() {
+  static IDEEPStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(IDEEP, GetIDEEPStaticContext());
+
 } // namespace caffe2
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@ -87,12 +87,12 @@ class ImageInputOp final
  unique_ptr<db::DBReader> owned_reader_;
  const db::DBReader* reader_;
  CPUContext cpu_context_;
-  TensorCPU prefetched_image_;
-  TensorCPU prefetched_label_;
+  Tensor prefetched_image_{CPU};
+  Tensor prefetched_label_{CPU};
  vector<TensorCPU> prefetched_additional_outputs_;
-  Tensor<Context> prefetched_image_on_device_;
-  Tensor<Context> prefetched_label_on_device_;
-  vector<Tensor<Context>> prefetched_additional_outputs_on_device_;
+  Tensor prefetched_image_on_device_{Context::GetDeviceType()};
+  Tensor prefetched_label_on_device_{Context::GetDeviceType()};
+  vector<Tensor> prefetched_additional_outputs_on_device_;
  // Default parameters for images
  PerImageArg default_arg_;
  int batch_size_;
@ -118,8 +118,8 @@ class ImageInputOp final
  int crop_;
  std::vector<float> mean_;
  std::vector<float> std_;
-  Tensor<Context> mean_gpu_;
-  Tensor<Context> std_gpu_;
+  Tensor mean_gpu_{Context::GetDeviceType()};
+  Tensor std_gpu_{Context::GetDeviceType()};
  bool mirror_;
  bool is_test_;
  bool use_caffe_datum_;
@ -154,8 +154,6 @@ ImageInputOp<Context>::ImageInputOp(
    Workspace* ws)
    : PrefetchOperator<Context>(operator_def, ws),
      reader_(nullptr),
-      prefetched_additional_outputs_(OutputSize() - 2),
-      prefetched_additional_outputs_on_device_(OutputSize() - 2),
      batch_size_(
          OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
      label_type_(static_cast<LABEL_TYPE>(
@ -385,6 +383,9 @@ ImageInputOp<Context>::ImageInputOp(
  }

  for (int i = 0; i < additional_output_sizes.size(); ++i) {
+    prefetched_additional_outputs_on_device_.emplace_back(
+        Context::GetDeviceType());
+    prefetched_additional_outputs_.emplace_back(CPU);
    prefetched_additional_outputs_[i].Resize(
        TIndex(batch_size_), TIndex(additional_output_sizes[i]));
  }
@ -1207,12 +1208,12 @@ bool ImageInputOp<Context>::Prefetch() {
  // If the context is not CPUContext, we will need to do a copy in the
  // prefetch function as well.
  if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);
+    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);

    for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
      prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &context_);
+          prefetched_additional_outputs_[i], &cpu_context_);
    }
  }

@ -1223,13 +1224,13 @@ bool ImageInputOp<Context>::Prefetch() {

 template <class Context>
 bool ImageInputOp<Context>::CopyPrefetched() {
-  auto* image_output = OperatorBase::Output<Tensor<Context> >(0);
-  auto* label_output = OperatorBase::Output<Tensor<Context> >(1);
-  vector<Tensor<Context>*> additional_outputs_output;
+  auto type = Context::GetDeviceType();
+  auto* image_output = OperatorBase::Output<Tensor>(0, type);
+  auto* label_output = OperatorBase::Output<Tensor>(1, type);
+  vector<Tensor*> additional_outputs_output;

  for (int i = 2; i < OutputSize(); ++i) {
-    additional_outputs_output.push_back(
-        OperatorBase::Output<Tensor<Context>>(i));
+    additional_outputs_output.push_back(OperatorBase::Output<Tensor>(i, type));
  }

  // Note(jiayq): The if statement below should be optimized away by the
@ -1249,9 +1250,11 @@ bool ImageInputOp<Context>::CopyPrefetched() {
        mean_gpu_.Resize(mean_.size());
        std_gpu_.Resize(std_.size());

-        context_.template Copy<float, CPUContext, Context>(
-          mean_.size(), mean_.data(), mean_gpu_.template mutable_data<float>());
-        context_.template Copy<float, CPUContext, Context>(
+        context_.template CopyFromCPU<float>(
+            mean_.size(),
+            mean_.data(),
+            mean_gpu_.template mutable_data<float>());
+        context_.template CopyFromCPU<float>(
            std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
        mean_std_copied_ = true;
      }
--- a/caffe2/image/transform_gpu.cu
+++ b/caffe2/image/transform_gpu.cu
@ -50,8 +50,11 @@ __global__ void transform_kernel(

 template <typename T_IN, typename T_OUT, class Context>

-bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
-                    Tensor<Context>& mean, Tensor<Context>& std,
+bool TransformOnGPU(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
    Context* context) {
  // data comes in as NHWC
  const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
@ -68,16 +71,18 @@ bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
  return true;
 };

-template bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X,
-                                                          Tensor<CUDAContext> *Y,
-                                                          Tensor<CUDAContext>& mean,
-                                                          Tensor<CUDAContext>& std,
+template bool TransformOnGPU<uint8_t, float, CUDAContext>(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
    CUDAContext* context);

-template bool TransformOnGPU<uint8_t, float16, CUDAContext>(Tensor<CUDAContext>& X,
-                                                            Tensor<CUDAContext> *Y,
-                                                            Tensor<CUDAContext>& mean,
-                                                            Tensor<CUDAContext>& std,
+template bool TransformOnGPU<uint8_t, float16, CUDAContext>(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
    CUDAContext* context);

 }  // namespace caffe2
--- a/caffe2/image/transform_gpu.h
+++ b/caffe2/image/transform_gpu.h
@ -31,8 +31,11 @@
 namespace caffe2 {

 template <typename T_IN, typename T_OUT, class Context>
-bool TransformOnGPU(Tensor<Context>& X, Tensor<Context>* Y,
-                    Tensor<Context>& mean, Tensor<Context>& std,
+bool TransformOnGPU(
+    Tensor& X,
+    Tensor* Y,
+    Tensor& mean,
+    Tensor& std,
    Context* context);

 }  // namespace caffe2
--- a/caffe2/mkl/mkl_utils_test.cc
+++ b/caffe2/mkl/mkl_utils_test.cc
@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
  int pads[2] = {0, 0};

  // Creating Input and output tensors
-  TensorCPU X(vector<TIndex>{16, 8, 32, 32});
-  TensorCPU W(vector<TIndex>{64, 8, 3, 3});
-  TensorCPU b(vector<TIndex>{64});
-  TensorCPU Y(vector<TIndex>{16, 64, 30, 30});
+  Tensor X(vector<TIndex>{16, 8, 32, 32}, CPU);
+  Tensor W(vector<TIndex>{64, 8, 3, 3}, CPU);
+  Tensor b(vector<TIndex>{64}, CPU);
+  Tensor Y(vector<TIndex>{16, 64, 30, 30}, CPU);

  float* data = X.mutable_data<float>();
  for (int i = 0; i < X.size(); ++i) {
@ -56,7 +56,7 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
  // Test if the resource wrapper works.
  MKLMemory<float> X_wrapper(X.dims(), primitive, dnnResourceSrc);
  X_wrapper.CopyFrom(X);
-  TensorCPU X_recover(X.dims());
+  Tensor X_recover(X.dims(), CPU);
  X_wrapper.CopyTo(&X_recover);
  const float* recover_data = X_recover.data<float>();
  for (int i = 0; i < X_recover.size(); ++i) {
@ -93,7 +93,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
  // layout?). Test both cases.
  vector<vector<TIndex>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
  for (const auto& dims : dims_list) {
-    auto X_cpu_in = caffe2::make_unique<TensorCPU>(dims);
+    auto X_cpu_in = caffe2::make_unique<Tensor>(dims, CPU);
    CPUContext ctx;
    math::RandUniform<float, CPUContext>(
        X_cpu_in->size(),
@ -117,7 +117,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
    EXPECT_EQ(X_mkl1->size(), X_cpu_in->size());

    // CPU <- MKL1
-    auto X_cpu_out = caffe2::make_unique<TensorCPU>();
+    auto X_cpu_out = caffe2::make_unique<Tensor>(CPU);
    X_mkl1->CopyTo(X_cpu_out.get());
    EXPECT_EQ(X_cpu_out->dims(), dims);
    EXPECT_EQ(X_cpu_out->size(), X_cpu_in->size());
--- a/caffe2/mkl/operators/conv_op.cc
+++ b/caffe2/mkl/operators/conv_op.cc
@ -31,7 +31,7 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {

    const int M = filter.dim32(0);
    if (InputSize() == 2 && !zero_bias_) {
-      TensorCPU cpu_zero_bias;
+      Tensor cpu_zero_bias{CPU};
      cpu_zero_bias.Resize(M);
      CPUContext ctx;
      math::Set<T, CPUContext>(
@ -72,8 +72,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
      size_t bdata_sizes[4] = {W, H, C, N};
      // We will utilize the SetOutputSize() function int he base class
      // with dummy TensorCPU input and output to calculate the sizes.
-      TensorCPU dummy_input(X.dims());
-      TensorCPU dummy_output;
+      Tensor dummy_input(X.dims(), CPU);
+      Tensor dummy_output(CPU);
      ConvPoolOpBase<MKLContext>::SetOutputSize(
          dummy_input, &dummy_output, M);
      size_t tdata_sizes[4] = {
--- a/caffe2/mkl/operators/conv_op_mkldnn.cc
+++ b/caffe2/mkl/operators/conv_op_mkldnn.cc
@ -28,7 +28,7 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
    auto& X = Input(INPUT);
    auto& filter = Input(FILTER);
    auto& bias = Input(BIAS);
-    TensorCPU* Y = Output(0);
+    Tensor* Y = Output(0);
    CAFFE_ENFORCE(4 == X.ndim());
    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    CAFFE_ENFORCE(4 == filter.ndim());
--- a/caffe2/mkl/operators/operator_fallback_mkl.h
+++ b/caffe2/mkl/operators/operator_fallback_mkl.h
@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator<MKLContext> {
    for (int i = 0; i < InputSize(); ++i) {
      if (OperatorBase::InputIsType<MKLMemory<float>>(i)) {
        OperatorBase::Input<MKLMemory<float>>(i).CopyTo(
-            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+            local_input_blobs_[i]->GetMutableTensor(CPU));
      } else if (OperatorBase::InputIsType<MKLMemory<double>>(i)) {
        OperatorBase::Input<MKLMemory<double>>(i).CopyTo(
-            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+            local_input_blobs_[i]->GetMutableTensor(CPU));
      } else {
        VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
        // Note(jiayq): This removes a const but conceptually
--- a/caffe2/mkl/operators/packed_fc_op.cc
+++ b/caffe2/mkl/operators/packed_fc_op.cc
@ -49,7 +49,7 @@ class PackedFCOp final : public Operator<CPUContext> {

    // Check out what is the passed in format.
    const MKLPackedMatrix* packed_matrix = nullptr;
-    if (OperatorBase::InputIsType<TensorCPU>(1)) {
+    if (OperatorBase::InputIsType<Tensor>(1, CPU)) {
      const auto& W = Input(1);
      CAFFE_ENFORCE_EQ(W.ndim(), 2);
      CAFFE_ENFORCE_EQ(W.dim32(0), N);
@ -142,7 +142,7 @@ class PackedFCOp final : public Operator<CPUContext> {
  size_t axis_{1};
  uint32_t hash_{0};
  vector<TIndex> Y_shape_cache_;
-  Tensor<CPUContext> bias_multiplier_;
+  Tensor bias_multiplier_{CPU};
  std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
 };

--- a/caffe2/mkl/operators/pool_op.cc
+++ b/caffe2/mkl/operators/pool_op.cc
@ -61,8 +61,8 @@ bool MKLPoolOp<float>::RunOnDeviceWithOrderNCHW() {
  if (dims_changed || FLAGS_caffe2_mkl_memonger_in_use) {
    // We will utilize the SetOutputSize() function in the base class
    // with dummy TensorCPU input and output to calculate the sizes.
-    TensorCPU dummy_input(X.dims());
-    TensorCPU dummy_output;
+    Tensor dummy_input(X.dims(), CPU);
+    Tensor dummy_output(CPU);

    ConvPoolOpBase<MKLContext>::SetOutputSize(
        dummy_input, &dummy_output, X.dim32(1));
--- a/caffe2/mkl/operators/utility_ops.cc
+++ b/caffe2/mkl/operators/utility_ops.cc
@ -10,7 +10,7 @@ class CopyCPUToMKLOp final : public MKLOperator<float> {
 public:
  using MKLOperator<float>::MKLOperator;
  bool RunOnDevice() override {
-    const auto& X = OperatorBase::Input<TensorCPU>(0);
+    const auto& X = OperatorBase::Input<Tensor>(0, CPU);
    auto* Y = OperatorBase::OutputBlob(0);
    if (!Y->template IsType<MKLMemory<float>>() ||
        Y->Get<MKLMemory<float>>().dims() != X.dims()) {
@ -27,7 +27,7 @@ class CopyMKLToCPUOp final : public MKLOperator<float> {

  bool RunOnDevice() override {
    const auto& X = OperatorBase::Input<MKLMemory<float>>(0);
-    auto* Y = OperatorBase::Output<TensorCPU>(0);
+    auto* Y = OperatorBase::Output<Tensor>(0, CPU);
    X.CopyTo(Y);
    return true;
  }
--- a/caffe2/mkl/utils/mkl_context.cc
+++ b/caffe2/mkl/utils/mkl_context.cc
@ -1,5 +1,6 @@
 // #include "caffe2/mkl/utils/mkl_context.h"

+#include "mkl_context.h"
 #include "caffe2/core/event_cpu.h"

 namespace caffe2 {
@ -18,4 +19,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(MKLDNN, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(MKLDNN, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(MKLDNN, EventResetCPU);

+BaseStaticContext* GetMKLStaticContext() {
+  static MKLStaticContext context;
+  return &context;
+}
+
+REGISTER_STATIC_CONTEXT(MKLDNN, GetMKLStaticContext());
+
 } // namespace caffe2
--- a/caffe2/mkl/utils/mkl_context.h
+++ b/caffe2/mkl/utils/mkl_context.h
@ -6,9 +6,12 @@
 #include <random>

 #include "caffe2/core/context.h"
+#include "caffe2/core/context_base.h"

 namespace caffe2 {

+BaseStaticContext* GetMKLStaticContext();
+
 /**
 * The MKL Context, which is largely the same as the CPUContext. We instantiate
 * this mainly in order to have a first-class MKL device.
@ -17,7 +20,7 @@ namespace caffe2 {
 * operators to mainly perform input and output via MKLMemory. As a result,
 * most likely MKLContext::New and ::Delete won't be used as often.
 */
-class MKLContext final {
+class MKLContext : public BaseContext {
 public:
  MKLContext() : random_seed_(RandomNumberSeed()) {}
  explicit MKLContext(const DeviceOption& option)
@ -27,20 +30,28 @@ class MKLContext final {
    CAFFE_ENFORCE_EQ(option.device_type(), MKLDNN);
  }

-  ~MKLContext() {}
+  ~MKLContext() override {}

-  inline void SwitchToDevice(int /*stream_id*/ = 0) {}
+  BaseStaticContext* GetStaticContext() const override {
+    return GetMKLStaticContext();
+  }

-  inline void WaitEvent(const Event& ev) {
+  static BaseStaticContext* StaticContext() {
+    return GetMKLStaticContext();
+  }
+
+  inline void SwitchToDevice(int /*stream_id*/ = 0) override {}
+
+  inline void WaitEvent(const Event& ev) override {
    ev.Wait(MKLDNN, this);
  }

-  inline void Record(Event* ev, const char* err_msg = nullptr) const {
+  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(MKLDNN, this, err_msg);
  }

-  inline void FinishDeviceComputation() {}
+  inline void FinishDeviceComputation() override {}

  inline std::mt19937& RandGenerator() {
    if (!random_generator_.get()) {
@ -50,7 +61,29 @@ class MKLContext final {
  }

  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
-    return GetCPUAllocator()->New(nbytes);
+    return StaticContext()->New(nbytes);
+  }
+
+  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
+    if (nbytes == 0) {
+      return;
+    }
+    CAFFE_ENFORCE(src);
+    CAFFE_ENFORCE(dst);
+    memcpy(dst, src, nbytes);
+  }
+
+  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
+    CopyBytesSameDevice(nbytes, src, dst);
+  }
+
+  bool SupportsNonFundamentalTypes() const override {
+    // MKL meta copy is OK
+    return true;
  }

  // Two copy functions that deals with cross-device copies.
@ -90,10 +123,18 @@ class MKLContext final {
    return false;
  }

-  static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
+  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
    return true;
  }

+  DeviceType GetDevicetype() const override {
+    return MKLDNN;
+  }
+
+  static constexpr DeviceType GetDeviceType() {
+    return MKLDNN;
+  }
+
 protected:
  // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
  int random_seed_{1701};
@ -108,21 +149,26 @@ inline void MKLContext::CopyBytes<MKLContext, MKLContext>(
  memcpy(dst, src, nbytes);
 }

-template <>
-inline void MKLContext::CopyBytes<CPUContext, MKLContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  memcpy(dst, src, nbytes);
+class MKLStaticContext : public BaseStaticContext {
+ public:
+  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
+    return GetCPUAllocator()->New(nbytes);
  }

-template <>
-inline void MKLContext::CopyBytes<MKLContext, CPUContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  memcpy(dst, src, nbytes);
+  std::unique_ptr<BaseContext> CreateContext() override {
+    return caffe2::make_unique<MKLContext>();
  }
+
+  std::unique_ptr<BaseContext> CreateContext(
+      const DeviceOption& option) override {
+    return caffe2::make_unique<MKLContext>(option);
+  }
+
+  DeviceType GetDeviceType() override {
+    return MKLDNN;
+  }
+};
+
 } // namespace caffe2

 #endif // CAFFE2_UTILS_MKL_CONTEXT_H_
--- a/caffe2/mobile/contrib/CMakeLists.txt
+++ b/caffe2/mobile/contrib/CMakeLists.txt
@ -1,7 +1,10 @@
 add_subdirectory(ios)
-add_subdirectory(opengl)
+# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
+if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
+  # add_subdirectory(opengl)
+endif()
 if (USE_ACL)
-  add_subdirectory(arm-compute)
+  # add_subdirectory(arm-compute)
 endif()
 # Finally pass the src lists back to the parent

--- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
+++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
@ -43,7 +43,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
  if (first_run_) {
    first_run_ = false;
    for (int i = 0; i < Inputs().size(); ++i) {
-      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
+      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
      Y->Resize(inputs_[i]->dims());
      Y->template mutable_data<float>();
    }
@ -54,7 +54,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
      // GLTensor
      auto* X = inputs_[i].get();
      X->lazy_allocate(Xblob, second_run_, true);
-      auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
+      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
      Timer timer;
      timer.Start();
      getTensorCPU(*X, *Y);
--- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
+++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
@ -27,7 +27,7 @@ template<typename T = float>
 void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
                     std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
  Blob *blob = ws->CreateBlob(name);
-  auto *tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  tensor->Resize(dims);
  T *t_data = tensor->mutable_data<T>();
  std::random_device rd;
--- a/caffe2/mobile/contrib/ios/ios_caffe.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe.cc
@ -41,7 +41,7 @@ void GenerateStylizedImage(std::vector<float>& originalImage,
  caffe2::Predictor p(init_net, predict_net);

  std::vector<int> dims({1, 3, height, width});
-  caffe2::TensorCPU input;
+  caffe2::Tensor input(caffe2::CPU);
  input.Resize(dims);
  input.ShareExternalPointer(originalImage.data());
  caffe2::Predictor::TensorVector input_vec{&input};
--- a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
@ -50,7 +50,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,

 void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
  caffe2::FLAGS_caffe2_force_shared_col_buffer = true;
-  caffe2::TensorCPU input;
+  caffe2::Tensor input(caffe2::CPU);
  input.Resize(inData.dims);
  input.ShareExternalPointer(inData.data);
  caffe2::Predictor::TensorVector input_vec{&input};
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@ -256,9 +256,9 @@ void computeOutputHW(
    int W,
    int* OH,
    int* OW) {
-  Tensor<CPUContext> input, output;
+  Tensor input(CPU), output(CPU);
  input.Resize(1, 1, H, W);
-  op->SetOutputSize<CPUContext>(input, &output, 1);
+  op->SetOutputSize(input, &output, 1);
  CAFFE_ENFORCE_EQ(output.ndim(), 4);
  *OH = output.dim(2);
  *OW = output.dim(3);
@ -495,7 +495,7 @@ class MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocessOp final
      caffe2::Timer rt;
      // Initialize random noise on first use.
      // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->template GetMutable<TensorCPU>();
+      auto* t = noiseBlob->GetMutableTensor(CPU);
      t->Resize(noiseSize);
      math::RandGaussian<float, CPUContext>(
          t->size(),
--- a/caffe2/mobile/contrib/ios/pool_test.cc
+++ b/caffe2/mobile/contrib/ios/pool_test.cc
@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  tensor->Resize(shape);

  math::RandGaussian<float, CPUContext>(
--- a/caffe2/mobile/contrib/ios/resize_test.cc
+++ b/caffe2/mobile/contrib/ios/resize_test.cc
@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  tensor->Resize(shape);

  math::RandGaussian<float, CPUContext>(
--- a/caffe2/mobile/contrib/nnapi/nnapi.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi.cc
@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) {
        output_dims.push_back(dim);
      }

-      auto* tensor = ws_.CreateBlob(blob)->GetMutable<TensorCPU>();
+      auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU);
      tensor->Resize(output_dims);
      outputs->push_back(tensor);

--- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
@ -43,14 +43,14 @@ static double benchmark_conv_caffe2(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
    if (group == 1) {
      t->Resize(K, C, kernel, kernel);
    } else {
@ -61,7 +61,7 @@ static double benchmark_conv_caffe2(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -129,14 +129,14 @@ static double benchmark_conv_nnapi(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
    if (group > 1) {
      CAFFE_ENFORCE_EQ(C, group);
      t->Resize(1, kernel, kernel, C);
@ -148,7 +148,7 @@ static double benchmark_conv_nnapi(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -190,7 +190,7 @@ static double benchmark_conv_nnapi(
  NetDef initNet;
  NNApi model(initNet, netdef, ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
  CAFFE_ENFORCE(model.run(inputs, &outputs));

  for (int i = 0; i < warmup; i++) {
@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    for (int i = 0; i < t->size(); i++) {
      t->mutable_data<uint8_t>()[i] = rand() % 10;
    }
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
    if (group > 1) {
      CAFFE_ENFORCE_EQ(C, group);
      t->Resize(1, kernel, kernel, C);
@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8(
  // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
  // bias_scale == input_scale * filter_scale.
  {
-    auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(K);
    for (int i = 0; i < t->size(); i++) {
      t->mutable_data<int32_t>()[i] = rand() % 10;
@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8(
  NetDef initNet;
  NNApi model(initNet, netdef, ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
  CAFFE_ENFORCE(model.run(inputs, &outputs));

  for (int i = 0; i < warmup; i++) {
--- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) {
  // CPU reference
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) {
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];

@ -103,21 +103,21 @@ static void test_conv_NHWC(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(K, kernel, kernel, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -189,7 +189,7 @@ static void test_conv_NHWC(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];

@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(1, kernel, kernel, D);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(D);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];

@ -428,7 +428,7 @@ static void test_pooling(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -496,7 +496,7 @@ static void test_pooling(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];

@ -506,7 +506,7 @@ static void test_pooling(
 static void test_softmax(int N, int C, int H = 1, int W = 1) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    if (H == 1 && W == 1) {
      t->Resize(N, C);
    } else {
@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) {
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
+  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];

--- a/caffe2/mobile/contrib/opengl/CMakeLists.txt
+++ b/caffe2/mobile/contrib/opengl/CMakeLists.txt
@ -1,4 +1,3 @@
-if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
 add_subdirectory(core)
 add_subdirectory(operators)

@ -9,6 +8,4 @@ if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
 if (IOS)
  add_subdirectory(ios)
 endif()
-endif()
-
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
--- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1,
  LOG(INFO) << "OPENGLCopyFrom/To Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -275,7 +275,7 @@ void testOpenGLConv(int N,
            << " Op: " << glPoolOperationName[poolOp];
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -301,7 +301,7 @@ void testOpenGLConv(int N,
  }

  if (poolOp != AveragePool && poolOp != MaxPool) {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
      t->Resize(K, C, kernel_h, kernel_w);
    } else {
@ -343,7 +343,7 @@ void testOpenGLConv(int N,

    // bias
    {
-      auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
      t->Resize(K);
      CPUContext ctx;
      if (random_input) {
@ -367,7 +367,7 @@ void testOpenGLConv(int N,
  }

  if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -532,7 +532,7 @@ void testOpenGLPRelu(
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -541,7 +541,7 @@ void testOpenGLPRelu(

  // prelu scale
  {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
    t->Resize(prelu_size);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
    t0->Resize(N, C, H, W);
    CPUContext ctx0;
    // Too noisy.
    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);

-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
    t1->Resize(N, C, H, W);
    CPUContext ctx1;
    // Too noisy.
@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {

  Workspace ws;
  {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
+    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
    t0->Resize(N, C, H, W);
    CPUContext ctx0;
    // Too noisy.
    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);

-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
+    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
    t1->Resize(N, C, H, W);
    CPUContext ctx1;
    // Too noisy.
@ -814,7 +814,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
            << "H: " << H << ", W: " << W;
  Workspace ws;
  for (int i = 0; i < Cs.size(); i++) {
-    auto* t = ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutable<TensorCPU>();
+    auto* t =
+        ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU);
    t->Resize(N, Cs[i], H, W);
    CPUContext ctx0;
    // Too noisy.
@ -890,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -941,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
@ -991,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
  }

  {
-    auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
    t->Resize(1);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
@ -1059,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
  LOG(INFO) << "OpenGL Softmax Test "
            << "N: " << N << " D: " << D << " Tiled:" << tiled;
  Workspace ws;
-  auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
  {
    t->Resize(N, D);
    CPUContext ctx;
@ -1150,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -1162,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {

  // scale
  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1171,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
  }
  // bias
  {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1253,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -1265,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {

  // scale
  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1274,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
  }
  // bias
  {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1283,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
  }
  // prelu scale
  {
-    auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
    t->Resize(C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1384,7 +1385,7 @@ void OpenGL_speedtest(int N,
            << " C: " << C << " H: " << H << " W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -1398,7 +1399,7 @@ void OpenGL_speedtest(int N,
  }

  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(K, C, kernel_h, kernel_w);
    CPUContext ctx;
    if (random_input) {
@ -1412,7 +1413,7 @@ void OpenGL_speedtest(int N,
  }

  {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -1478,7 +1479,7 @@ void testOpenGLPadImage(
  {
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
      t->Resize(N, C, H, W);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1592,7 +1593,7 @@ void testOpenGLResize(int N,
  {
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
      t->Resize(N, C, H, W);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1674,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGL Preprocess Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1683,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
  }

  {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
    t->Resize(3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 100;
@ -1747,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGLDeprocess Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1756,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
  }

  {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
    t->Resize(3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 30;
@ -1799,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGLNormPlanarYUV Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, 3, H, W);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1808,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  }

  {
-    auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
    t->Resize(1, 3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 30;
@ -1817,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  }

  {
-    auto* t = ws.CreateBlob("stdev")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU);
    t->Resize(1, 3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 6;
@ -1878,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N,
  LOG(INFO) << "OpenGL CopyOps Speed Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -1892,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N,
  }

  {
-    auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
    t->Resize(K, C, kernel_h, kernel_w);
    CPUContext ctx;
    if (random_input) {
@ -1906,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N,
  }

  {
-    auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -1989,7 +1990,8 @@ void compareModelsForOpenGL(std::string name,
    Workspace cws;
    cws.RunNetOnce(initNet);

-    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))
+                      ->GetMutableTensor(CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2030,8 +2032,8 @@ void compareModelsForOpenGL(std::string name,
    Workspace mws;
    mws.RunNetOnce(initNet);

-    auto* t_gl =
-        mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))
+                     ->GetMutableTensor(CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2113,7 +2115,8 @@ void compareBatchedToTiledModels(std::string name,
    Workspace tws;
    tws.RunNetOnce(initNet);

-    auto* t_batch = tws.CreateBlob(bachedNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_batch =
+        tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2139,7 +2142,8 @@ void compareBatchedToTiledModels(std::string name,
    Workspace bws;
    bws.RunNetOnce(initNet);

-    auto* t_tiling = bws.CreateBlob(tiledNet.external_input(0))->GetMutable<TensorCPU>();
+    auto* t_tiling =
+        bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
--- a/caffe2/mobile/contrib/snpe/snpe_op.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op.cc
@ -111,7 +111,8 @@ class SNPEOp final : public Operator<CPUContext> {
    X(snpe_copy_output_to);
    snpe_copy_output_to_f(ctx_.get(), Output(0)->mutable_data<float>());

-    CAFFE_ENFORCE(Output(0)->data<float>(), "nullptr where output should be!\n");
+    CAFFE_ENFORCE(
+        Output(0)->data<float>(), "nullptr where output should be!\n");
    return true;
  }

--- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
@ -11,17 +11,19 @@
 #if TEST_REAL_DATA
 #include "data_chw.h"
 #include "data_hwc.h"
-#define POPULATE_DATA(_n, _s, _l) do {\
+#define POPULATE_DATA(_n, _s, _l)                                         \
+  do {                                                                    \
    Blob* _blob = ws.CreateBlob((_n));                                    \
-  auto* _tensor = _blob->GetMutable<TensorCPU>();\
+    auto* _tensor = _blob->GetMutableTensor(CPU);                         \
    _tensor->Resize((_s));                                                \
    memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes()); \
  } while (0)
 #else
 // Rough test on static data
-#define POPULATE_DATA(_n, _s, _l) do {\
+#define POPULATE_DATA(_n, _s, _l)                                 \
+  do {                                                            \
    Blob* _blob = ws.CreateBlob((_n));                            \
-  auto* _tensor = _blob->GetMutable<TensorCPU>();\
+    auto* _tensor = _blob->GetMutableTensor(CPU);                 \
    _tensor->Resize((_s));                                        \
    memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes()); \
  } while (0)
@ -41,7 +43,7 @@ void AddConstInput(const vector<TIndex>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  tensor->Resize(shape);
  math::Set<float, CPUContext>(tensor->size(), value,
                               tensor->mutable_data<float>(),
@ -54,7 +56,7 @@ void AddNoiseInput(const vector<TIndex>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  tensor->Resize(shape);

  math::RandGaussian<float, CPUContext>(
@ -176,7 +178,7 @@ int main(int argc, char** argv) {
  float avg_diff = total_diff; // Avg difference as percentage (not a great metric)
  printf("Average difference is %f%%\n", avg_diff * 100);
  printf("JS Divergence is %f\n", JS_divergence); // Jensen-Shannon
-  printf("KL Divergence is %f\n", KL_divergence); // Kullback–Leibler
+  printf("KL Divergence is %f\n", KL_divergence); // Kullback-Leibler
  printf("Predicted %d with %f%% confidence\n", max_index, max * 100);

  printf ("Caffe2: %f microseconds.\n", t_caffe2);
--- a/caffe2/mobile/contrib/ulp2/ulp.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp.cc
@ -261,14 +261,14 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
  state->XQs.resize(k2b1bXBits);
  state->YQs.resize(k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
-    state->XQs[i] = caffe2::make_unique<TensorCPU>();
-    state->YQs[i] = caffe2::make_unique<TensorCPU>();
+    state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
+    state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
  }
-  state->WQ = caffe2::make_unique<TensorCPU>();
-  state->WQN = caffe2::make_unique<TensorCPU>();
-  state->WQL1Norm = caffe2::make_unique<TensorCPU>();
-  state->scratch = caffe2::make_unique<TensorCPU>();
-  state->scratchColBuffer = caffe2::make_unique<TensorCPU>();
+  state->WQ = caffe2::make_unique<Tensor>(CPU);
+  state->WQN = caffe2::make_unique<Tensor>(CPU);
+  state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
+  state->scratch = caffe2::make_unique<Tensor>(CPU);
+  state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);

  signQuantize(W, state->WQ.get());
  filterNormalization11(*(state->WQ), state->WQN.get());
@ -290,7 +290,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
  };
  if (b) {
    CPUContext context;
-    state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
+    state->bias = caffe2::make_unique<Tensor>(*b, &context, CPU);
  }
  return state;
 }
--- a/caffe2/mobile/contrib/ulp2/ulp_neon.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_neon.cc
@ -438,7 +438,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
  const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
  Y->Resize(X.dim32(0), OH, OW, OC);
  if (!state->WQPacked) {
-    state->WQPacked = caffe2::make_unique<TensorCPU>();
+    state->WQPacked = caffe2::make_unique<Tensor>(CPU);
    qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
--- a/caffe2/mobile/contrib/ulp2/ulp_test.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc
@ -63,7 +63,7 @@ int randInt(int a, int b) {
 }

 TensorCPU genTensor11(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
  r.Resize(shape);

  std::random_device rd;
@ -77,7 +77,7 @@ TensorCPU genTensor11(std::vector<TIndex> shape) {
 }

 TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
  r.Resize(shape);

  std::random_device rd;
@ -91,7 +91,7 @@ TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
 }

 TensorCPU genTensor0123(std::vector<TIndex> shape) {
-  TensorCPU r;
+  Tensor r(CPU);
  r.Resize(shape);

  std::random_device rd;
@ -114,7 +114,7 @@ TEST(ULP, QPadZero) {
  const auto ICQ = 1;

  auto X = genTensor11({1, 10, 10, ICQ * 8});
-  TensorCPU XQ, XQPad;
+  Tensor XQ(CPU), XQPad(CPU);
  signQuantize(X, &XQ);
  qpad_zero(args, XQ, &XQPad);

@ -174,7 +174,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo
 void gemmTest(TIndex M, TIndex N, TIndex K) {
  auto X = genTensor11({M, K});
  auto W = genTensor11({N, K});
-  TensorCPU XQ, WQ, YQ, Y;
+  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
  {
    signQuantize(X, &XQ);
    signQuantize(W, &WQ);
@ -207,7 +207,7 @@ TEST(QConv, ConvTest) {
  int K = 3;
  auto X = genTensor11({1, S, S, IC});
  auto W = genTensor11({OC, K, K, IC});
-  TensorCPU XQ, WQ, YQ, Y;
+  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
  {
    signQuantize(X, &XQ);
    signQuantize(W, &WQ);
@ -235,16 +235,16 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
  auto X = genTensor0123({N, H, W, IC});
  auto W_ = genTensor11({OC, KH, KW, IC});
  auto bias = genTensorUniform11({OC});
-  TensorCPU Y, YQ, Y2b1b, YOP;
+  Tensor Y(CPU), YQ(CPU), Y2b1b(CPU), YOP(CPU);

  {
    std::vector<std::unique_ptr<TensorCPU>> XQs(k2b1bXBits);
    std::vector<std::unique_ptr<TensorCPU>> YQs(k2b1bXBits);
    for (auto i = 0; i < k2b1bXBits; ++i) {
-      XQs[i] = caffe2::make_unique<TensorCPU>();
-      YQs[i] = caffe2::make_unique<TensorCPU>();
+      XQs[i] = caffe2::make_unique<Tensor>(CPU);
+      YQs[i] = caffe2::make_unique<Tensor>(CPU);
    }
-    TensorCPU WQN, WQ;
+    Tensor WQN(CPU), WQ(CPU);
    uniformQuantize2b1b(X, XQs, 0.5, 1.0);
    signQuantize(W_, &WQ);
    filterNormalization11(WQ, &WQN);
@ -289,17 +289,17 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
    def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
    def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
    def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
-    auto* Xws = ws.CreateBlob("X")->GetMutable<TensorCPU>();
+    auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU);
    Xws->ResizeLike(X);
    Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
-    auto* Wws = ws.CreateBlob("W")->GetMutable<TensorCPU>();
+    auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU);
    Wws->ResizeLike(W_);
    Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
-    auto* bws = ws.CreateBlob("b")->GetMutable<TensorCPU>();
+    auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU);
    bws->ResizeLike(bias);
    bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
    ws.RunOperatorOnce(def);
-    YOP.CopyFrom<CPUContext>(ws.GetBlob("Y")->Get<TensorCPU>());
+    YOP.CopyFrom(ws.GetBlob("Y")->Get<TensorCPU>());
  }

  { conv(args, X, W_, &bias, &Y); }
--- a/caffe2/mpi/mpi_gpu_test.cc
+++ b/caffe2/mpi/mpi_gpu_test.cc
@ -55,7 +55,6 @@ TEST(MPITest, TestMPIBroadcast) {
  arg->set_f(rank);
  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
  for (int root = 0; root < size; ++root) {
    net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
    Workspace ws;
@ -63,8 +62,8 @@ TEST(MPITest, TestMPIBroadcast) {
    EXPECT_NE(nullptr, net.get());
    EXPECT_TRUE(net->Run());
    // Let's test the value.
-    auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
-    TensorCPU X_cpu(X);
+    auto& X = ws.GetBlob("X")->Get<Tensor>();
+    Tensor X_cpu(X, CPU);
    EXPECT_EQ(X.size(), 10);
    for (int i = 0; i < X.size(); ++i) {
      EXPECT_EQ(X_cpu.data<float>()[i], root);
@ -133,7 +132,7 @@ TEST(MPITest, TestMPIReduce) {
      auto& X = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
      EXPECT_EQ(X.size(), 10);
      int expected_result = size * (size - 1) / 2;
-      TensorCPU X_cpu(X);
+      Tensor X_cpu(X, CPU);
      for (int i = 0; i < X.size(); ++i) {
        EXPECT_EQ(X_cpu.data<float>()[i], expected_result);
      }
@ -190,7 +189,7 @@ TEST(MPITest, TestMPIAllgather) {
  EXPECT_TRUE(net->Run());
  // Let's test the value.
  auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
-  TensorCPU X_cpu(X);
+  Tensor X_cpu(X, CPU);
  EXPECT_EQ(X.size(), 20);
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_EQ(X_cpu.data<float>()[i], rank);
@ -199,7 +198,7 @@ TEST(MPITest, TestMPIAllgather) {
  EXPECT_EQ(X_gathered.size(), 20 * size);
  EXPECT_EQ(X_gathered.dim(0), 2 * size);
  EXPECT_EQ(X_gathered.dim(1), 10);
-  TensorCPU X_gathered_cpu(X_gathered);
+  Tensor X_gathered_cpu(X_gathered, CPU);
  for (int i = 0; i < X_gathered.size(); ++i) {
    EXPECT_EQ(X_gathered_cpu.data<float>()[i], i / 20);
  }
@ -254,14 +253,14 @@ TEST(MPITest, TestMPIAllreduce) {
  // Let's test the value.
  auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
  EXPECT_EQ(X.size(), 10);
-  TensorCPU X_cpu(X);
+  Tensor X_cpu(X, CPU);
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_EQ(X_cpu.data<float>()[i], rank);
  }
  auto& X_reduced = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
  EXPECT_EQ(X_reduced.size(), 10);
  int expected_result = size * (size - 1) / 2;
-  TensorCPU X_reduced_cpu(X_reduced);
+  Tensor X_reduced_cpu(X_reduced, CPU);
  for (int i = 0; i < X_reduced.size(); ++i) {
    EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
  }
@ -316,7 +315,7 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
  auto& X_reduced = ws.GetBlob("X")->Get<TensorCUDA>();
  EXPECT_EQ(X_reduced.size(), 10);
  int expected_result = size * (size - 1) / 2;
-  TensorCPU X_reduced_cpu(X_reduced);
+  Tensor X_reduced_cpu(X_reduced, CPU);
  for (int i = 0; i < X_reduced.size(); ++i) {
    EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
  }
--- a/caffe2/mpi/mpi_ops.h
+++ b/caffe2/mpi/mpi_ops.h
@ -36,8 +36,7 @@ class MPIBroadcastOp final : public Operator<Context> {
  bool RunOnDevice() override {
    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
    CAFFE_ENFORCE(
-        OperatorBase::OutputIsType<Tensor<Context>>(0),
-        "Output is of wrong type.");
+        OperatorBase::OutputIsType<Tensor>(0), "Output is of wrong type.");
    auto* output = Output(0);
    // Make sure that output is already allocated.
    CAFFE_ENFORCE(
@ -168,8 +167,8 @@ class MPISendTensorOp final : public Operator<Context> {
    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
    auto& input = Input(INPUT);
    if (InputSize() == 4) {
-      dst_ = OperatorBase::Input<TensorCPU>(DST).template data<int>()[0];
-      tag_ = OperatorBase::Input<TensorCPU>(TAG).template data<int>()[0];
+      dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
+      tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
    }
    if (raw_buffer_) {
      // We need to do a const cast to cope with the fact that, before OpenMPI
@ -211,8 +210,8 @@ class MPIReceiveTensorOp final : public Operator<Context> {
  bool RunOnDevice() override {
    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
    if (InputSize() == 4) {
-      src_ = OperatorBase::Input<TensorCPU>(SRC_IN).template data<int>()[0];
-      tag_ = OperatorBase::Input<TensorCPU>(TAG_IN).template data<int>()[0];
+      src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
+      tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
    }
    MPI_Status status;
    if (raw_buffer_) {
@ -228,10 +227,10 @@ class MPIReceiveTensorOp final : public Operator<Context> {
    } else {
      CAFFE_NOT_IMPLEMENTED;
    }
-    auto* src_out = OperatorBase::Output<TensorCPU>(SRC_OUT);
+    auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
    src_out->Resize();
    src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
-    auto* tag_out = OperatorBase::Output<TensorCPU>(TAG_OUT);
+    auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
    tag_out->Resize();
    tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
    return true;
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@ -26,17 +26,10 @@ void ProfileOperatorObserver::Dump() const {
  LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type()
            << " op#" << getId() << " ---------";
  for (int i = 0; i < subject_->InputSize(); ++i) {
-    if (subject_->InputIsType<TensorCPU>(i)) {
-      const auto& tensor = subject_->Input<TensorCPU>(i);
+    const auto& tensor = subject_->Input<Tensor>(i);
    const auto& name = subject_->debug_def().input(i);
    TensorPrinter printer(name);
    LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
-    } else if (subject_->InputIsType<TensorCUDA>(i)) {
-      const auto& tensor = subject_->Input<TensorCUDA>(i);
-      const auto& name = subject_->debug_def().input(i);
-      TensorPrinter printer(name);
-      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
-    }
  }

  int a = 0;
@ -46,13 +39,13 @@ void ProfileOperatorObserver::Dump() const {
  }

  for (int o = 0; o < subject_->OutputSize(); ++o) {
-    if (subject_->OutputIsType<TensorCPU>(o)) {
-      auto* tensor = subject_->Output<TensorCPU>(o);
+    if (subject_->OutputIsType<Tensor>(o, CPU)) {
+      auto* tensor = subject_->Output<Tensor>(o, CPU);
      const auto& name = subject_->debug_def().output(o);
      TensorPrinter printer(name);
      LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
-    } else if (subject_->OutputIsType<TensorCUDA>(o)) {
-      auto* tensor = subject_->Output<TensorCUDA>(o);
+    } else if (subject_->OutputIsType<Tensor>(o, CUDA)) {
+      auto* tensor = subject_->Output<Tensor>(o, CUDA);
      const auto& name = subject_->debug_def().output(o);
      TensorPrinter printer(name);
      LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
--- a/caffe2/operators/accuracy_op.cc
+++ b/caffe2/operators/accuracy_op.cc
@ -38,7 +38,7 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
    }
  }
  CAFFE_ENFORCE_LE(correct, N);
-  *(Y->mutable_data<float>()) = static_cast<float>(correct) / N;
+  *(Y->template mutable_data<float>()) = static_cast<float>(correct) / N;

  return true;
 }
@ -61,11 +61,20 @@ classes, it is considered a correct prediction.
        "top_k",
        "Count as correct by comparing the true label to the top k scoring "
        "classes (default 1: only compare to the top scoring class i.e. argmax)")
-  .Input(0, "predictions", "2-D tensor (Tensor<float>) of size "
+    .Input(
+        0,
+        "predictions",
+        "2-D tensor (Tensor<float>) of size "
        "(num_batches x num_classes) containing scores")
-  .Input(1, "labels", "1-D tensor (Tensor<int>) of size (num_batches) having "
+    .Input(
+        1,
+        "labels",
+        "1-D tensor (Tensor<float>) of size (num_batches) having "
        "the indices of true labels")
-  .Output(0, "accuracy", "1-D tensor (Tensor<float>) of size 1 containing "
+    .Output(
+        0,
+        "accuracy",
+        "1-D tensor (Tensor<float>) of size 1 containing "
        "accuracy");

 SHOULD_NOT_DO_GRADIENT(Accuracy);
--- a/caffe2/operators/accuracy_op.cu
+++ b/caffe2/operators/accuracy_op.cu
@ -54,7 +54,7 @@ bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
  CAFFE_ENFORCE_EQ(label.ndim(), 1);
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  Y->Resize(vector<TIndex>());
-  float* Ydata = Y->mutable_data<float>();
+  float* Ydata = Y->template mutable_data<float>();
  math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
  AccuracyKernel<<<
      std::min(CAFFE_MAXIMUM_NUM_BLOCKS, N),
--- a/caffe2/operators/affine_channel_op.cc
+++ b/caffe2/operators/affine_channel_op.cc
@ -70,7 +70,7 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
      scale_dims.data(),
      dY_data,
      scale_data,
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
      &context_);
  if (is_learnable_) {
    const auto& X = Input(1);
@ -85,8 +85,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
        HxW,
        dY_data,
        X_data,
-        dscale->mutable_data<float>(),
-        dbias->mutable_data<float>());
+        dscale->template mutable_data<float>(),
+        dbias->template mutable_data<float>());
  }
  return true;
 }
@ -104,7 +104,12 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  const float* dY_data = dY.data<float>();
  const float* scale_data = scale.data<float>();
  math::RowwiseMul<float, CPUContext>(
-      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
+      rows,
+      cols,
+      dY_data,
+      scale_data,
+      dX->template mutable_data<float>(),
+      &context_);
  if (is_learnable_) {
    const auto& X = Input(1);
    const float* X_data = X.data<float>();
@ -120,8 +125,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
        HxW,
        dY_data,
        X_data,
-        dscale->mutable_data<float>(),
-        dbias->mutable_data<float>());
+        dscale->template mutable_data<float>(),
+        dbias->template mutable_data<float>());
  }
  return true;
 }
--- a/caffe2/operators/affine_channel_op.cu
+++ b/caffe2/operators/affine_channel_op.cu
@ -71,7 +71,7 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
      scale_dims.data(),
      dY_data,
      scale_data,
-      dX->mutable_data<float>(),
+      dX->template mutable_data<float>(),
      &context_);
  if (is_learnable_) {
    const auto& X = Input(1);
@ -91,8 +91,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
            HxW,
            dY_data,
            X_data,
-            dscale->mutable_data<float>(),
-            dbias->mutable_data<float>());
+            dscale->template mutable_data<float>(),
+            dbias->template mutable_data<float>());
  }
  return true;
 }
@ -110,7 +110,12 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
  const float* dY_data = dY.data<float>();
  const float* scale_data = scale.data<float>();
  math::RowwiseMul<float, CUDAContext>(
-      rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
+      rows,
+      cols,
+      dY_data,
+      scale_data,
+      dX->template mutable_data<float>(),
+      &context_);
  if (is_learnable_) {
    const auto& X = Input(1);
    const float* X_data = X.data<float>();
@ -130,8 +135,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
            HxW,
            dY_data,
            X_data,
-            dscale->mutable_data<float>(),
-            dbias->mutable_data<float>());
+            dscale->template mutable_data<float>(),
+            dbias->template mutable_data<float>());
  }
  return true;
 }
--- a/caffe2/operators/apmeter_op.cc
+++ b/caffe2/operators/apmeter_op.cc
@ -58,7 +58,7 @@ bool APMeterOp<float, CPUContext>::RunOnDevice() {

  const auto* Xdata = X.data<float>();
  const auto* labelData = label.data<int>();
-  auto* Ydata = Y->mutable_data<float>();
+  auto* Ydata = Y->template mutable_data<float>();

  BufferPredictions(Xdata, labelData, N, D);

@ -116,7 +116,7 @@ per class for the average precision of that class.
    .Input(
        1,
        "labels",
-        "2-D tensor (Tensor<int>) of size (num_samples) "
+        "2-D tensor (Tensor<float>) of size (num_samples) "
        "containing true labels for each sample")
    .Output(
        0,
--- a/caffe2/operators/assert_op.h
+++ b/caffe2/operators/assert_op.h
@ -41,7 +41,7 @@ class AssertOp final : public Operator<Context> {
  }

 private:
-  TensorCPU cmp_tensor_;
+  Tensor cmp_tensor_{CPU};
  std::string error_msg_;
 };

--- a/caffe2/operators/atomic_ops.cc
+++ b/caffe2/operators/atomic_ops.cc
@ -33,8 +33,8 @@ class AtomicFetchAddOp final : public Operator<CPUContext> {
    d->Resize(std::vector<TIndex>());
    auto* aPtr = a.data<int32_t>();
    auto* bPtr = b.data<int32_t>();
-    auto* cPtr = c->mutable_data<int32_t>();
-    auto* dPtr = d->mutable_data<int32_t>();
+    auto* cPtr = c->template mutable_data<int32_t>();
+    auto* dPtr = d->template mutable_data<int32_t>();
    std::lock_guard<std::mutex> lg(*mutex);
    *dPtr = *aPtr;
    *cPtr = *aPtr + *bPtr;
@ -77,7 +77,7 @@ class CheckAtomicBoolOp final : public Operator<CPUContext> {
  bool RunOnDevice() override {
    auto& ptr = OperatorBase::Input<std::unique_ptr<std::atomic<bool>>>(0);
    Output(0)->Resize(1);
-    *Output(0)->mutable_data<bool>() = ptr->load();
+    *Output(0)->template mutable_data<bool>() = ptr->load();
    return true;
  }
 };
--- a/caffe2/operators/batch_gather_ops.cu
+++ b/caffe2/operators/batch_gather_ops.cu
@ -31,7 +31,7 @@ __global__ void BatchGatherKernel(
 template <>
 bool BatchGatherOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<TensorCUDA>(INDICES));
+      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
 }

 template <>
@ -99,7 +99,7 @@ __global__ void BatchGatherGradientKernel(
 template <>
 bool BatchGatherGradientOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-      this, OperatorBase::Input<TensorCUDA>(INDICES));
+      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
 }

 template <>
@ -107,7 +107,7 @@ template <typename TInd>
 bool BatchGatherGradientOp<CUDAContext>::DoRunWithType() {
  return DispatchHelper<
      TensorTypes2<float, GenericTensorImplementation>,
-      TInd>::call(this, OperatorBase::Input<TensorCUDA>(DATA));
+      TInd>::call(this, OperatorBase::Input<Tensor>(DATA, CUDA));
 }

 template <>
--- a/caffe2/operators/batch_gather_ops.h
+++ b/caffe2/operators/batch_gather_ops.h
@ -15,7 +15,7 @@ class BatchGatherOp final : public Operator<Context> {

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
  }

  template <typename TInd>
@ -54,8 +54,7 @@ class BatchGatherOp final : public Operator<Context> {
        auto src =
            src_base + idx * block_bytesize + batch * data_batch_bytesize;
        auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize;
-        context_.template CopyItems<Context, Context>(
-            data.meta(), block_size, src, dst);
+        context_.CopyItemsSameDevice(data.meta(), block_size, src, dst);
      }
    }
    return true;
@ -72,7 +71,7 @@ class BatchGatherGradientOp final : public Operator<Context> {

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
-        this, OperatorBase::Input<TensorCPU>(INDICES));
+        this, OperatorBase::Input<Tensor>(INDICES, CPU));
  }

  template <typename TInd>
--- a/caffe2/operators/batch_matmul_op.h
+++ b/caffe2/operators/batch_matmul_op.h
@ -20,7 +20,7 @@ class BatchMatMulOp final : public Operator<Context> {
        broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
        use_scratch_(OperatorBase::GetSingleArgument<int>("use_scratch", 0)) {
    if (use_scratch_) {
-      scratch_ = std::make_shared<Tensor<Context>>();
+      scratch_ = std::make_shared<Tensor>(Context::GetDeviceType());
    }
  }

@ -282,7 +282,7 @@ class BatchMatMulOp final : public Operator<Context> {
  bool broadcast_;

  bool use_scratch_;
-  std::shared_ptr<Tensor<Context>> scratch_;
+  std::shared_ptr<Tensor> scratch_;
 };

 } // namespace caffe2
--- a/caffe2/operators/batch_matmul_op_gpu_test.cc
+++ b/caffe2/operators/batch_matmul_op_gpu_test.cc
@ -30,20 +30,20 @@ class BatchMatMulOpGPUTest : public testing::Test {
      const float value,
      const string& name) {
    Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+    auto* tensor = blob->GetMutableTensor(CUDA);
    tensor->Resize(dims);
    math::Set<float, CUDAContext>(
        tensor->size(),
        value,
-        tensor->mutable_data<float>(),
+        tensor->template mutable_data<float>(),
        cuda_context_.get());
  }

  void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
    const Blob* Y_blob = ws_.GetBlob("Y");
    ASSERT_NE(nullptr, Y_blob);
-    const auto& Y = Y_blob->Get<Tensor<CUDAContext>>();
-    TensorCPU Y_cpu(Y);
+    const auto& Y = Y_blob->Get<Tensor>();
+    Tensor Y_cpu(Y, CPU);
    const auto& Y_dims = Y_cpu.dims();
    ASSERT_EQ(dims.size(), Y_dims.size());
    for (std::size_t i = 0; i < dims.size(); ++i) {
--- a/caffe2/operators/batch_matmul_op_test.cc
+++ b/caffe2/operators/batch_matmul_op_test.cc
@ -24,12 +24,12 @@ class BatchMatMulOpTest : public testing::Test {
      const float value,
      const string& name) {
    Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutable<TensorCPU>();
+    auto* tensor = blob->GetMutableTensor(CPU);
    tensor->Resize(dims);
    math::Set<float, CPUContext>(
        tensor->size(),
        value,
-        tensor->mutable_data<float>(),
+        tensor->template mutable_data<float>(),
        cpu_context_.get());
  }

--- a/caffe2/operators/bbox_transform_op.cc
+++ b/caffe2/operators/bbox_transform_op.cc
@ -144,7 +144,9 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {

  box_out->ResizeLike(delta_in);
  Eigen::Map<ERArrXXf> new_boxes(
-      box_out->mutable_data<float>(), box_out->dim32(0), box_out->dim32(1));
+      box_out->template mutable_data<float>(),
+      box_out->dim32(0),
+      box_out->dim32(1));

  // We assume roi_in and delta_in over multiple batches are grouped
  // together in increasing order as generated by GenerateProposalsOp
@ -187,7 +189,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
    auto* roi_batch_splits = Output(1);
    roi_batch_splits->Resize(batch_size);
    Eigen::Map<EArrXf> roi_batch_splits_map(
-        roi_batch_splits->mutable_data<float>(), batch_size);
+        roi_batch_splits->template mutable_data<float>(), batch_size);
    roi_batch_splits_map =
        Eigen::Map<const EArrXi>(num_rois_per_batch.data(), batch_size)
            .cast<float>();
--- a/caffe2/operators/boolean_mask_ops.cc
+++ b/caffe2/operators/boolean_mask_ops.cc
@ -91,8 +91,7 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
      const auto* src = inPtr + lastStart * innerSizeBytes;
      auto* dst = outPtr + outStart * innerSizeBytes;
      int numItems = i - lastStart;
-      context_.template CopyItems<CPUContext, CPUContext>(
-          data.meta(), numItems * innerSize, src, dst);
+      context_.CopyItemsSameDevice(data.meta(), numItems * innerSize, src, dst);
      outStart += numItems;
      lastStart = -1;
    }
@ -356,9 +355,9 @@ bool SequenceMaskOp<CPUContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CPUContext>::DoRunWithType() {
-  const Tensor<CPUContext>* input = &Input(0);
-  const Tensor<CPUContext>* sequence_lengths = nullptr;
-  const Tensor<CPUContext>* window_centers = nullptr;
+  const Tensor* input = &Input(0);
+  const Tensor* sequence_lengths = nullptr;
+  const Tensor* window_centers = nullptr;

  if (mode_ == "sequence") {
    sequence_lengths = &Input(1);
@ -413,7 +412,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
          SequenceFunctor(
              sequence_lengths->data<int>(), sequence_lengths->size()),
          fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
    } else {
      MaskWithFunctor(
          left,
@ -423,7 +422,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
          SequenceFunctor(
              sequence_lengths->data<int>(), sequence_lengths->size()),
          fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
    }
  } else if (mode_ == "window") {
    MaskWithFunctor(
@ -433,7 +432,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
        input->data<T>(),
        WindowFunctor(window_centers->data<int>(), radius_),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "upper") {
    MaskWithFunctor(
        left,
@ -442,7 +441,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
        input->data<T>(),
        UpperFunctor(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "lower") {
    MaskWithFunctor(
        left,
@ -451,7 +450,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
        input->data<T>(),
        LowerFunctor(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "upperdiag") {
    MaskWithFunctor(
        left,
@ -460,7 +459,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
        input->data<T>(),
        UpperDiagFunctor(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "lowerdiag") {
    MaskWithFunctor(
        left,
@ -469,7 +468,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
        input->data<T>(),
        LowerDiagFunctor(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else {
    CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
    return false;
--- a/caffe2/operators/boolean_mask_ops.cu
+++ b/caffe2/operators/boolean_mask_ops.cu
@ -73,8 +73,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {

    // Copy numOfOutput from gpu to cpu
    TIndex numOfOutput;
-    context_.Copy<TIndex, CUDAContext, CPUContext>(
-        1, numOfOutputData, &numOfOutput);
+    context_.CopyToCPU(1, numOfOutputData, &numOfOutput);

    indices_.Resize(numOfOutput);
    std::vector<TIndex> dims = src.dims();
@ -85,7 +84,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
    if (OutputSize() == 2) {
      auto* indicesOut = Output(1);
      indicesOut->Resize(numOfOutput);
-      indicesOut->mutable_data<TIndex>();
+      indicesOut->template mutable_data<TIndex>();
    }

    if (numOfOutput > 0) {
@ -109,8 +108,8 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
  }

 private:
-  Tensor<CUDAContext> indices_;
-  Tensor<CUDAContext> scratch_;
+  Tensor indices_{CUDA};
+  Tensor scratch_{CUDA};
 };

 REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>);
@ -297,9 +296,9 @@ bool SequenceMaskOp<CUDAContext>::RunOnDevice() {
 template <>
 template <class T>
 bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
-  const Tensor<CUDAContext>* input = &Input(0);
-  const Tensor<CUDAContext>* sequence_lengths = nullptr;
-  const Tensor<CUDAContext>* window_centers = nullptr;
+  const Tensor* input = &Input(0);
+  const Tensor* sequence_lengths = nullptr;
+  const Tensor* window_centers = nullptr;

  if (mode_ == "sequence") {
    sequence_lengths = &Input(1);
@ -355,7 +354,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
          input->data<T>(),
          sequence_lengths->data<int>(),
          fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
    } else {
      sequenceMaskKernel<<<
          CAFFE_GET_BLOCKS(left * right),
@ -368,7 +367,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
          input->data<T>(),
          sequence_lengths->data<int>(),
          fill_val,
-          output->mutable_data<T>());
+          output->template mutable_data<T>());
    }
  } else if (mode_ == "window") {
    windowMaskKernel<<<
@ -383,7 +382,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
        window_centers->data<int>(),
        radius_,
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "upper") {
    upperMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
@ -395,7 +394,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
        batch_dim,
        input->data<T>(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "lower") {
    lowerMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
@ -407,7 +406,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
        batch_dim,
        input->data<T>(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "upperdiag") {
    upperDiagMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
@ -419,7 +418,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
        batch_dim,
        input->data<T>(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else if (mode_ == "lowerdiag") {
    lowerDiagMaskKernel<<<
        CAFFE_GET_BLOCKS(left * right),
@ -431,7 +430,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
        batch_dim,
        input->data<T>(),
        fill_val,
-        output->mutable_data<T>());
+        output->template mutable_data<T>());
  } else {
    CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
  }
--- a/caffe2/operators/boolean_unmask_ops.cu
+++ b/caffe2/operators/boolean_unmask_ops.cu
@ -77,9 +77,9 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
      hostValuesData[i] = (char*)value.raw_data();
      hostValueSizesData[i] = value.size();
    }
-    masks_.CopyFrom(hostMasks_, &context_);
-    values_.CopyFrom(hostValues_, &context_);
-    valueSizes_.CopyFrom(hostValueSizes_, &context_);
+    masks_.CopyFrom(hostMasks_);
+    values_.CopyFrom(hostValues_);
+    valueSizes_.CopyFrom(hostValueSizes_);

    indices_.Resize(maskSize);
    auto* indicesData = indices_.mutable_data<int>();
@ -109,14 +109,14 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
  }

 private:
-  Tensor<CUDAContext> indices_;
-  Tensor<CUDAContext> masks_;
-  Tensor<CUDAContext> values_;
-  Tensor<CUDAContext> valueSizes_;
+  Tensor indices_{CUDA};
+  Tensor masks_{CUDA};
+  Tensor values_{CUDA};
+  Tensor valueSizes_{CUDA};

-  Tensor<CPUContext> hostMasks_;
-  Tensor<CPUContext> hostValues_;
-  Tensor<CPUContext> hostValueSizes_;
+  Tensor hostMasks_{CPU};
+  Tensor hostValues_{CPU};
+  Tensor hostValueSizes_{CPU};
 };

 REGISTER_CUDA_OPERATOR(BooleanUnmask, BooleanUnmaskOp<CUDAContext>);
--- a/caffe2/operators/boolean_unmask_ops_test.cc
+++ b/caffe2/operators/boolean_unmask_ops_test.cc
@ -16,13 +16,13 @@ static void AddScalarInput(
    Workspace* ws,
    bool isEmpty = false) {
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<TensorCPU>();
+  auto* tensor = blob->GetMutableTensor(CPU);
  if (!isEmpty) {
    tensor->Resize(vector<TIndex>{1});
-    *(tensor->mutable_data<DataT>()) = value;
+    *(tensor->template mutable_data<DataT>()) = value;
  } else {
    tensor->Resize(vector<TIndex>{0});
-    tensor->mutable_data<DataT>();
+    tensor->template mutable_data<DataT>();
  }
  return;
 }
--- a/caffe2/operators/box_with_nms_limit_op.cc
+++ b/caffe2/operators/box_with_nms_limit_op.cc
@ -77,8 +77,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
  out_boxes->Resize(0, box_dim);
  out_classes->Resize(0);

-  TensorCPU* out_keeps = nullptr;
-  TensorCPU* out_keeps_size = nullptr;
+  Tensor* out_keeps = nullptr;
+  Tensor* out_keeps_size = nullptr;
  if (OutputSize() > 4) {
    out_keeps = Output(4);
    out_keeps_size = Output(5);
@ -194,7 +194,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
      auto cur_boxes = boxes.block(0, j * box_dim, boxes.rows(), box_dim);
      auto& cur_keep = keeps[j];
      Eigen::Map<EArrXf> cur_out_scores(
-          out_scores->mutable_data<float>() + cur_start_idx + cur_out_idx,
+          out_scores->template mutable_data<float>() + cur_start_idx +
+              cur_out_idx,
          cur_keep.size());
      Eigen::Map<ERArrXXf> cur_out_boxes(
          out_boxes->mutable_data<float>() +
@ -202,7 +203,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
          cur_keep.size(),
          box_dim);
      Eigen::Map<EArrXf> cur_out_classes(
-          out_classes->mutable_data<float>() + cur_start_idx + cur_out_idx,
+          out_classes->template mutable_data<float>() + cur_start_idx +
+              cur_out_idx,
          cur_keep.size());

      utils::GetSubArray(
@ -220,9 +222,11 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
      out_keeps->Extend(total_keep_count, 50, &context_);

      Eigen::Map<EArrXi> out_keeps_arr(
-          out_keeps->mutable_data<int>() + cur_start_idx, total_keep_count);
+          out_keeps->template mutable_data<int>() + cur_start_idx,
+          total_keep_count);
      Eigen::Map<EArrXi> cur_out_keeps_size(
-          out_keeps_size->mutable_data<int>() + b * num_classes, num_classes);
+          out_keeps_size->template mutable_data<int>() + b * num_classes,
+          num_classes);

      cur_out_idx = 0;
      for (int j = 0; j < num_classes; j++) {
@ -240,7 +244,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
    auto* batch_splits_out = Output(3);
    batch_splits_out->Resize(batch_size);
    Eigen::Map<EArrXf> batch_splits_out_map(
-        batch_splits_out->mutable_data<float>(), batch_size);
+        batch_splits_out->template mutable_data<float>(), batch_size);
    batch_splits_out_map =
        Eigen::Map<const EArrXi>(total_keep_per_batch.data(), batch_size)
            .cast<float>();
--- a/caffe2/operators/ceil_op.cu
+++ b/caffe2/operators/ceil_op.cu
@ -22,7 +22,7 @@ bool CeilOp<float, CUDAContext>::RunOnDevice() {
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
-      X.size(), X.data<float>(), Y->mutable_data<float>());
+      X.size(), X.data<float>(), Y->template mutable_data<float>());
  return true;
 }

--- a/Show More
+++ b/Show More