Removing some dependency edges from Blob to other caffe2 (#11923)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11923 This is pre-work to allow moving Blob to ATen/core, which cannot depend on caffe2 anymore. (1) Removing the Blob -> Tensor dependency allows us to move Blob to ATen/core and use it inside IValue without having to wait for the Tensor merge to be complete. (2) In the final Blob design, we want it to be a very small class that doesn't have any special treatment for Tensor (or to be more correct, doesn't allow storing Tensor anymore), so this is anyhow the direction we want to go. This changes call sites that will have to be moved to IValue later, but they cannot be moved to IValue directly, because for that, IValue first needs to be able to store Blob, which in turn first needs this diff and some other changes coming up in future diffs. Codemods: $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)\\.IsTensorType\\(" "BlobIsTensorType(\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)->IsTensorType\\(" "BlobIsTensorType(*\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)\\.GetMutableTensor\\(" "BlobGetMutableTensor(\\1, " $ codemod --extensions h,hpp,c,cpp,cc "([a-zA-Z0-9_]+)->GetMutableTensor\\(" "BlobGetMutableTensor(*\\1, " It is, however, not only these codemods, because regex-based refactoring was only able to match a small number of the call sites. To catch more, I would've needed an AST-aware tool like clangr, which I didn't figure out how to use. Reviewed By: ezyang Differential Revision: D9979976 fbshipit-source-id: 2ea17724e223b5b73b44f99362727759ca689e61
2025-10-21 13:44:15 +08:00 · 2018-09-24 22:52:14 -07:00
parent dfa03e94eb
commit 17a65bf9b6
66 changed files with 380 additions and 371 deletions
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@ -163,7 +163,7 @@ void loadInput(
          CAFFE_THROW("Not support GPU on mobile.");
 #endif
        } else {
-          caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+          caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
          CHECK_NOTNULL(tensor);
          tensor->Resize(input_dims);
          if (input_type_list[i] == "uint8_t") {
@ -200,7 +200,7 @@ void fillInputBlob(
    int protos_size = tensor_kv.second.protos_size();
    caffe2::TensorProto* tensor_proto =
        tensor_kv.second.mutable_protos(iteration % protos_size);
-    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+    caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
    if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
      int total_size = tensor_proto->string_data_size();
      for (size_t i = 0; i < total_size; i++) {
@ -298,7 +298,7 @@ void writeOutput(
 #endif
        } else {
          writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
-              workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
+              BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU),
              output_prefix,
              name);
        }
--- a/binaries/speed_benchmark.cc
+++ b/binaries/speed_benchmark.cc
@ -137,7 +137,7 @@ int main(int argc, char** argv) {
        if (blob == nullptr) {
          blob = workspace->CreateBlob(input_names[i]);
        }
-        caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+        caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
        CHECK_NOTNULL(tensor);
        tensor->Resize(input_dims);
        if (input_type_list[i] == "uint8_t") {
--- a/caffe2/contrib/gloo/common.cc
+++ b/caffe2/contrib/gloo/common.cc
@ -12,7 +12,7 @@ namespace caffe2 {
 namespace gloo {
 void signalFailure(Blob* status_blob, std::exception& /* unused */) {
-  auto* res = status_blob->GetMutableTensor(CPU);
+  auto* res = BlobGetMutableTensor(status_blob, CPU);
  res->Resize(1);
  res->template mutable_data<int32_t>()[0] = 1;
 }
--- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
+++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc
@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
  option.set_device_type(PROTO_CUDA);
  CUDAContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = BlobGetMutableTensor(blob, CUDA);
  tensor->Resize(shape);
  math::Set<float, CUDAContext>(tensor->size(), value,
                                tensor->mutable_data<float>(),
--- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc
+++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc
@ -95,10 +95,10 @@ void BlobToTensorProto(
  }
  // Set values
-  if (blob->IsTensorType(CPU)) {
+  if (BlobIsTensorType(*blob, CPU)) {
    const auto& cpu_tensor = blob->template Get<TensorCPU>();
    CPUTensorToTensorProto(cpu_tensor, t);
-  } else if (blob->IsTensorType(CUDA)) {
+  } else if (BlobIsTensorType(*blob, CUDA)) {
    const auto& cuda_tensor = blob->template Get<TensorCUDA>();
    const auto cpu_tensor = TensorCPU(cuda_tensor, context);
    context->FinishDeviceComputation();
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@ -6,16 +6,16 @@
 #include <typeinfo>
 #include <type_traits>
 #include <vector>
 #include "caffe2/core/blob_serializer_base.h"
 #include "caffe2/core/common.h"
 #include <ATen/core/typeid.h>
 #include "caffe2/core/logging.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/core/typeid.h"
 #include "caffe2/proto/caffe2_pb.h"
 namespace caffe2 {
 class Tensor;
 /**
 * @brief Blob is a general container that hosts a typed pointer.
 *
@ -50,15 +50,6 @@ class CAFFE2_API Blob final {
    return meta_.Match<T>();
  }
  bool IsTensorType(DeviceType device_type) const {
    bool is_match = meta_.Match<Tensor>();
    auto* tensor = static_cast<Tensor*>(pointer_);
    if (is_match && tensor && tensor->GetDeviceType() == device_type) {
      return true;
    }
    return false;
  }
  /**
   * Returns the meta info of the blob.
   */
@ -109,9 +100,6 @@ class CAFFE2_API Blob final {
        std::is_default_constructible<T>::value,
        "GetMutable can't be called with non-default-constructible types. "
        "Try using specialized methods");
    static_assert(
        !std::is_same<T, Tensor>::value,
        "Use GetMutableTensor(DeviceType) instead");
    if (IsType<T>()) {
      return static_cast<T*>(pointer_);
    } else {
@ -129,16 +117,6 @@ class CAFFE2_API Blob final {
    }
  }
  inline Tensor* GetMutableTensor(DeviceType device_type) {
    if (IsTensorType(device_type)) {
      return static_cast<Tensor*>(pointer_);
    } else {
      VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
              << " DeviceType:" << device_type;
      return Reset<Tensor>(new Tensor(device_type));
    }
  }
  /**
   * Sets the underlying object to the allocated one. The Blob then takes over
   * the ownership of the passed in pointer. If there is already an object in
@ -248,5 +226,29 @@ inline void swap(Blob& lhs, Blob& rhs) {
  lhs.swap(rhs);
 }
 // Free-function check of whether `blob` holds a Tensor located on
 // `device_type`.
 //
 // Returns false when the blob's type meta does not match Tensor; otherwise
 // returns true only if the stored tensor's GetDeviceType() equals
 // `device_type`.
 inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) {
  bool is_match = blob.meta().Match<Tensor>();
  if (!is_match) {
    // Not a Tensor at all: bail out before calling the typed accessor.
    return false;
  }
  // The type was verified above, so Get<Tensor>() is the matching accessor.
  const Tensor* tensor = &blob.Get<Tensor>();
  // `tensor` is the address of the object Get<Tensor>() refers to; the
  // null test is kept as a defensive guard before the device comparison.
  return tensor && tensor->GetDeviceType() == device_type;
 }
 // Free-function accessor returning a mutable Tensor on `device_type` that is
 // stored in `blob`.
 //
 // If the blob already holds a Tensor whose GetDeviceType() equals
 // `device_type`, that tensor is returned as-is. Otherwise the blob is Reset
 // to a freshly constructed Tensor(device_type) and the result of Reset is
 // returned.
 //
 // `blob` is dereferenced unconditionally, so callers must pass a non-null
 // pointer.
 inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) {
  if (blob->IsType<Tensor>()) {
    Tensor* tensor = blob->GetMutable<Tensor>();
    // Reuse the existing tensor only when it is on the requested device.
    if (tensor->GetDeviceType() == device_type) {
      return tensor;
    }
  }
  // if we're here, then either Blob didn't hold a Tensor
  // or that Tensor had the wrong DeviceType.
  VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
          << " DeviceType:" << device_type;
  // NOTE(review): Reset presumably takes ownership of the new Tensor and
  // drops the blob's previous content — confirm against Blob::Reset.
  return blob->Reset<Tensor>(new Tensor(device_type));
 }
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_BLOB_H_
--- a/caffe2/core/blob_gpu_test.cc
+++ b/caffe2/core/blob_gpu_test.cc
@ -132,7 +132,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
    for (int i = 0; i < 6; ++i) {                                          \
      cpu_tensor.mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
    }                                                                      \
-    blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor);                     \
+    BlobGetMutableTensor(&blob, CUDA)->CopyFrom(cpu_tensor);               \
    string serialized = SerializeBlob(blob, "test");                       \
    BlobProto proto;                                                       \
    CAFFE_ENFORCE(proto.ParseFromString(serialized));                      \
@ -149,7 +149,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
    }                                                                      \
    Blob new_blob;                                                         \
    EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob));               \
-    EXPECT_TRUE(new_blob.IsTensorType(CUDA));                              \
+    EXPECT_TRUE(BlobIsTensorType(new_blob, CUDA));                         \
    Tensor new_cpu_tensor(blob.Get<Tensor>(), CPU);                        \
    EXPECT_EQ(new_cpu_tensor.ndim(), 2);                                   \
    EXPECT_EQ(new_cpu_tensor.dim(0), 2);                                   \
@ -199,7 +199,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
    // Test if the restored blob is still of the same device.
    blob.Reset();
    EXPECT_NO_THROW(DeserializeBlob(serialized, &blob));
-    EXPECT_TRUE(blob.IsTensorType(CUDA));
+    EXPECT_TRUE(BlobIsTensorType(blob, CUDA));
    EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()),
              gpu_id);
    // Test if we force the restored blob on a different device, we
@ -207,7 +207,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
    blob.Reset();
    proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
    EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob));
-    EXPECT_TRUE(blob.IsTensorType(CUDA));
+    EXPECT_TRUE(BlobIsTensorType(blob, CUDA));
    EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
  }
 }
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@ -363,7 +363,8 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) {
  auto tensor_proto = blob_proto.tensor();
  Deserialize(
      tensor_proto,
-      blob->GetMutableTensor(
+      BlobGetMutableTensor(
          blob,
          static_cast<DeviceType>(tensor_proto.device_detail().device_type())));
 }
--- a/caffe2/core/blob_test.cc
+++ b/caffe2/core/blob_test.cc
@ -86,15 +86,15 @@ TEST(BlobTest, Blob) {
  int* int_unused CAFFE2_UNUSED = blob.GetMutable<int>();
  EXPECT_TRUE(blob.IsType<int>());
  EXPECT_FALSE(blob.IsType<BlobTestFoo>());
-  EXPECT_FALSE(blob.IsTensorType(CPU));
+  EXPECT_FALSE(BlobIsTensorType(blob, CPU));
  BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable<BlobTestFoo>();
  EXPECT_TRUE(blob.IsType<BlobTestFoo>());
  EXPECT_FALSE(blob.IsType<int>());
-  EXPECT_FALSE(blob.IsTensorType(CPU));
+  EXPECT_FALSE(BlobIsTensorType(blob, CPU));
-  Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU);
+  Tensor* tensor_unused CAFFE2_UNUSED = BlobGetMutableTensor(&blob, CPU);
-  EXPECT_TRUE(blob.IsTensorType(CPU));
+  EXPECT_TRUE(BlobIsTensorType(blob, CPU));
  EXPECT_FALSE(blob.IsType<BlobTestFoo>());
  EXPECT_FALSE(blob.IsType<int>());
 }
@ -600,7 +600,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
 #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name)               \
  TEST(TensorTest, TensorSerialization_##TypeParam) {                     \
    Blob blob;                                                            \
-    Tensor* tensor = blob.GetMutableTensor(CPU);                          \
+    Tensor* tensor = BlobGetMutableTensor(&blob, CPU);                    \
    tensor->Resize(2, 3);                                                 \
    for (int i = 0; i < 6; ++i) {                                         \
      tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i);   \
@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
    }                                                                     \
    Blob new_blob;                                                        \
    EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob));              \
-    EXPECT_TRUE(new_blob.IsTensorType(CPU));                              \
+    EXPECT_TRUE(BlobIsTensorType(new_blob, CPU));                         \
    const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
    EXPECT_EQ(new_tensor.ndim(), 2);                                      \
    EXPECT_EQ(new_tensor.dim(0), 2);                                      \
@ -634,7 +634,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
                                                                          \
  TEST(EmptyTensorTest, TensorSerialization_##TypeParam) {                \
    Blob blob;                                                            \
-    TensorCPU* tensor = blob.GetMutableTensor(CPU);                       \
+    TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU);                 \
    tensor->Resize(0, 3);                                                 \
    tensor->mutable_data<TypeParam>();                                    \
    string serialized = SerializeBlob(blob, "test");                      \
@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
    EXPECT_EQ(tensor_proto.field_name##_size(), 0);                       \
    Blob new_blob;                                                        \
    EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob));              \
-    EXPECT_TRUE(new_blob.IsTensorType(CPU));                              \
+    EXPECT_TRUE(BlobIsTensorType(new_blob, CPU));                         \
    const TensorCPU& new_tensor = blob.Get<TensorCPU>();                  \
    EXPECT_EQ(new_tensor.ndim(), 2);                                      \
    EXPECT_EQ(new_tensor.dim(0), 0);                                      \
@ -669,7 +669,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data)
 TEST(TensorTest, TensorSerialization_CustomType) {
  Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU);
  tensor->Resize(2, 3);
  for (int i = 0; i < 6; ++i) {
    tensor->mutable_data<BlobTestFoo>()[i].val = i;
@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
  EXPECT_EQ(proto.type(), "Tensor");
  Blob new_blob;
  EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob));
-  EXPECT_TRUE(new_blob.IsTensorType(CPU));
+  EXPECT_TRUE(BlobIsTensorType(new_blob, CPU));
  const TensorCPU& new_tensor = blob.Get<TensorCPU>();
  EXPECT_EQ(new_tensor.ndim(), 2);
  EXPECT_EQ(new_tensor.dim(0), 2);
@ -696,7 +696,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
 TEST(TensorTest, Half) {
  const int64_t kSize = 3000000;
  Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU);
  tensor->Resize(kSize);
  for (int i = 0; i < tensor->size(); ++i) {
    tensor->mutable_data<at::Half>()[i].x = i % 10000;
@ -724,7 +724,7 @@ TEST(TensorTest, Half) {
  }
  Blob new_blob;
  EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob));
-  EXPECT_TRUE(new_blob.IsTensorType(CPU));
+  EXPECT_TRUE(BlobIsTensorType(new_blob, CPU));
  const TensorCPU& new_tensor = blob.Get<TensorCPU>();
  EXPECT_EQ(new_tensor.ndim(), 1);
  EXPECT_EQ(new_tensor.dim(0), kSize);
@ -860,7 +860,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
  {
    VLOG(1) << "Test begin";
    Blob blob;
-    Tensor* tensor = blob.GetMutableTensor(CPU);
+    Tensor* tensor = BlobGetMutableTensor(&blob, CPU);
    VLOG(1) << "Allocating blob";
    tensor->Resize(d1, d2);
    auto mutableData = tensor->mutable_data<TypeParam>();
@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
    load_op->Run();
    VLOG(1) << "Reading blob from workspace";
    auto new_blob = ws.GetBlob("test");
-    EXPECT_TRUE(new_blob->IsTensorType(CPU));
+    EXPECT_TRUE(BlobIsTensorType(*new_blob, CPU));
    const auto& new_tensor = new_blob->Get<TensorCPU>();
    EXPECT_EQ(new_tensor.ndim(), d1);
@ -1030,7 +1030,7 @@ TEST(CustomChunkSize, BigTensorSerialization) {
  int64_t size = d1 * d2;
  Blob blob;
-  TensorCPU* tensor = blob.GetMutableTensor(CPU);
+  TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU);
  tensor->Resize(d1, d2);
  tensor->mutable_data<float>();
  std::mutex mutex;
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@ -122,7 +122,7 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
    static_assert(
        std::is_same<T, Tensor>::value,
        "Output(int, DeviceType) is only available for Tensor");
-    return outputs_.at(idx)->GetMutableTensor(type);
+    return BlobGetMutableTensor(outputs_.at(idx), type);
  }
  template <typename T>
@ -149,7 +149,7 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
  }
  inline bool InputIsTensorType(int idx, DeviceType device_type) {
-    return inputs_.at(idx)->IsTensorType(device_type);
+    return BlobIsTensorType(*inputs_.at(idx), device_type);
  }
  template <typename T>
@ -162,7 +162,7 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
  }
  inline bool OutputIsTensorType(int idx, DeviceType type) {
-    return outputs_.at(idx)->IsTensorType(type);
+    return BlobIsTensorType(*outputs_.at(idx), type);
  }
  inline int InputSize() const {
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@ -131,7 +131,8 @@ struct WorkspaceIdInjector {
          "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
      int32_t global_ws_id = (seq_++) + (static_cast<int32_t>(node_id) << 16);
      Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
-      TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU);
+      TensorCPU* global_ws_id_tensor =
          BlobGetMutableTensor(global_ws_id_blob, CPU);
      global_ws_id_tensor->Resize();
      global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
      VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@ -151,7 +151,7 @@ class CAFFE2_API Workspace {
      auto* to_blob = CreateBlob(blob);
      CAFFE_ENFORCE(to_blob);
      const auto& from_tensor = from_blob->template Get<Tensor>();
-      auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType());
+      auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType());
      to_tensor->CopyFrom(from_tensor);
    }
  }
--- a/caffe2/ideep/operators/concat_split_op.cc
+++ b/caffe2/ideep/operators/concat_split_op.cc
@ -33,8 +33,9 @@ class IDEEPConcatOp final : public IDEEPOperator {
      if (OperatorBase::InputBlob(i).template IsType<itensor>()) {
        inputs.emplace_back(Input(i));
      } else {
-        CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU),
+        CAFFE_ENFORCE(
-                      "Expect cpu tensor if not itensor");
+            BlobIsTensorType(OperatorBase::InputBlob(i), CPU),
            "Expect cpu tensor if not itensor");
        auto& tensor_cpu = OperatorBase::Input<Tensor>(i, CPU);
        CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 ||
                      tensor_cpu.size_from_dim(0) == 0,
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@ -89,7 +89,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
          local_input_blobs_[i]->Reset();
        }
        input_share_[i] = false;
-        auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
+        auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU);
        dtensor->Resize(input.get_dims());
        if (input.is_public_format()) {
          dtensor->ShareExternalPointer(
@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
        continue;
      }
      CAFFE_ENFORCE(
-          local_output_blobs_[i]->IsTensorType(CPU),
+          BlobIsTensorType(*local_output_blobs_[i], CPU),
          "IDEEP fallback op currently does not support non-TensorCPU "
          "output type who needs copying.");
      const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
@ -153,7 +153,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
        VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
        Blob* dst = OperatorBase::OutputBlob(i);
        dst->Reset(new Tensor(CPU));
-        auto dtensor = dst->GetMutableTensor(CPU);
+        auto dtensor = BlobGetMutableTensor(dst, CPU);
        dtensor->Resize(src_dims);
        dtensor->ShareData(src);
      }
--- a/caffe2/ideep/operators/utility_ops.cc
+++ b/caffe2/ideep/operators/utility_ops.cc
@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
  USE_IDEEP_DEF_ALIASES();
  bool RunOnDevice() override {
    const auto& input_blob = OperatorBase::InputBlob(0);
-    if (input_blob.IsTensorType(CPU)) {
+    if (BlobIsTensorType(input_blob, CPU)) {
      VLOG(2) << "Directing sharing of TensorCPU";
      const auto& X = OperatorBase::Input<Tensor>(0, CPU);
      auto* Y = OperatorBase::Output<Tensor>(0, CPU);
--- a/caffe2/mkl/operators/operator_fallback_mkl.h
+++ b/caffe2/mkl/operators/operator_fallback_mkl.h
@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator<MKLContext> {
    for (int i = 0; i < InputSize(); ++i) {
      if (OperatorBase::InputIsType<MKLMemory<float>>(i)) {
        OperatorBase::Input<MKLMemory<float>>(i).CopyTo(
-            local_input_blobs_[i]->GetMutableTensor(CPU));
+            BlobGetMutableTensor(local_input_blobs_[i], CPU));
      } else if (OperatorBase::InputIsType<MKLMemory<double>>(i)) {
        OperatorBase::Input<MKLMemory<double>>(i).CopyTo(
-            local_input_blobs_[i]->GetMutableTensor(CPU));
+            BlobGetMutableTensor(local_input_blobs_[i], CPU));
      } else {
        VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
        // Note(jiayq): This removes a const but conceptually
@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator<MKLContext> {
        continue;
      }
      CAFFE_ENFORCE(
-          local_output_blobs_[i]->IsTensorType(CPU),
+          BlobIsTensorType(*local_output_blobs_[i], CPU),
          "MKL fallback op currently does not support non-TensorCPU "
          "output type who needs copying.");
      const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
--- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
+++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc
@ -43,7 +43,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
  if (first_run_) {
    first_run_ = false;
    for (int i = 0; i < Inputs().size(); ++i) {
-      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
+      auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
      Y->Resize(inputs_[i]->dims());
      Y->template mutable_data<float>();
    }
@ -54,7 +54,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
      // GLTensor
      auto* X = inputs_[i].get();
      X->lazy_allocate(Xblob, second_run_, true);
-      auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
+      auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
      Timer timer;
      timer.Start();
      getTensorCPU(*X, *Y);
--- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
+++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h
@ -27,7 +27,7 @@ template<typename T = float>
 void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
                     std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
  Blob *blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(dims);
  T *t_data = tensor->mutable_data<T>();
  std::random_device rd;
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@ -489,13 +489,13 @@ class MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocessOp final
        "noise_size", 491 /* prime to avoid artifacts */);
    // Treaded as half4 in the kernel, so need half4 here.
    noiseSize = divRoundUp(noiseSize, 4) * 4;
-    if (!noiseBlob->IsTensorType(CPU) ||
+    if (!BlobIsTensorType(*noiseBlob, CPU) ||
        noiseBlob->Get<TensorCPU>().size() != noiseSize) {
      VLOG(2) << "Initializing stylizer with noise: " << noiseSize;
      caffe2::Timer rt;
      // Initialize random noise on first use.
      // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(noiseBlob, CPU);
      t->Resize(noiseSize);
      math::RandGaussian<float, CPUContext>(
          t->size(),
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
@ -94,7 +94,7 @@ void testMPSCNN() {
              Workspace ws;
              for (auto i = 0; i < N; ++i) {
-                auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
                t->Resize(BS, C, H, W);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -152,7 +152,7 @@ void testMPSCNN() {
        Workspace ws;
        for (auto i = 0; i < N; ++i) {
-          auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
          switch (ndim) {
            case 1:
              t->Resize(5);
@ -210,7 +210,7 @@ void testMPSCNN() {
        LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: ";
        Workspace ws;
        {
-          auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
          t->Resize(batch_size, channels, 8, 13);
          CPUContext ctx;
          math::RandGaussian<float, CPUContext>(
@ -218,14 +218,14 @@ void testMPSCNN() {
        }
        {
-          auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
          t->Resize(1, channels);
          CPUContext ctx;
          math::RandGaussian<float, CPUContext>(
              t->size(), 0, 1, t->mutable_data<float>(), &ctx);
        }
        {
-          auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU);
          t->Resize(1, channels);
          CPUContext ctx;
          math::RandUniform<float, CPUContext>(
@ -290,7 +290,7 @@ void testMPSCNN() {
          for (const auto dim : {10, 40}) {
            Workspace ws;
            {
-              auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
              t->Resize(batchSize, channels, dim, dim);
              CPUContext ctx;
              // Too noisy.
@ -299,7 +299,7 @@ void testMPSCNN() {
            }
            {
-              auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
              t->Resize(channels);
              CPUContext ctx;
              for (auto i = 0; i < t->size(); ++i) {
@ -310,7 +310,7 @@ void testMPSCNN() {
              // t->mutable_data<float>(), &ctx);
            }
            {
-              auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
              t->Resize(channels);
              CPUContext ctx;
              for (auto i = 0; i < t->size(); ++i) {
@ -321,7 +321,7 @@ void testMPSCNN() {
              // t->mutable_data<float>(), &ctx);
            }
            {
-              auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU);
              t->Resize(prelu == PreluTy::SHARED ? 1 : channels);
              CPUContext ctx;
              // Too noisy.
@ -409,7 +409,7 @@ void testMPSCNN() {
          Workspace ws;
          const auto channels = array ? 12 : 3;
          {
-            auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+            auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
            t->Resize(batch_size, channels, 8, 13);
            CPUContext ctx;
            math::RandGaussian<float, CPUContext>(
@ -417,7 +417,7 @@ void testMPSCNN() {
          }
          {
-            auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+            auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
            t->Resize(shared ? channels : 1);
            CPUContext ctx;
            math::RandGaussian<float, CPUContext>(
@ -480,7 +480,7 @@ void testMPSCNN() {
        LOG(INFO) << "MPSCNNSpatialBN Test: " << channels;
        Workspace ws;
        {
-          auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
          t->Resize(batch_size, channels, 8, 13);
          CPUContext ctx;
          math::RandGaussian<float, CPUContext>(
@ -488,7 +488,7 @@ void testMPSCNN() {
        }
        for (const std::string name : {"scale", "bias", "mean", "var"}) {
-          auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU);
          t->Resize(channels);
          CPUContext ctx;
          // High mean to avoid var division by zero.
@ -575,7 +575,7 @@ void testMPSCNN() {
              LOG(INFO) << "MPSCNNFC Test";
              Workspace ws;
              {
-                auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
                t->Resize(batchSize, CIn, H, W);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -583,7 +583,7 @@ void testMPSCNN() {
              }
              {
-                auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
                t->Resize(COut, CIn * H * W);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -591,7 +591,7 @@ void testMPSCNN() {
              }
              {
-                auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
                t->Resize(COut);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -682,8 +682,8 @@ void testMPSCNN() {
                          LOG(INFO) << "MPSCNNPool Test: " << pool;
                          Workspace ws;
                          {
-                            auto* t =
+                            auto* t = BlobGetMutableTensor(
-                                ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                                ws.CreateBlob("X_cpu"), CPU);
                            t->Resize(batchSize, 8, 8, 13);
                            CPUContext ctx;
                            math::RandGaussian<float, CPUContext>(
@ -784,7 +784,7 @@ void testMPSCNN() {
         std::vector<std::vector<size_t>>{{1, 3, 50, 80}, {1, 12, 50, 80}}) {
      Workspace ws;
      {
-        auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+        auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
        t->Resize(dims);
        CPUContext ctx;
        math::RandGaussian<float, CPUContext>(
@ -860,7 +860,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNPreprocess Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 8, 13, 4);
      CPUContext ctx;
      for (auto i = 0; i < t->size(); ++i) {
@ -869,7 +869,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
      t->Resize(3);
      CPUContext ctx;
      t->mutable_data<float>()[0] = 100;
@ -940,7 +940,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNDeprocess Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 3, 8, 24);
      CPUContext ctx;
      for (auto i = 0; i < t->size(); ++i) {
@ -949,7 +949,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
      t->Resize(3);
      CPUContext ctx;
      t->mutable_data<float>()[0] = 100;
@ -999,7 +999,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNDeprocess Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 3, 1280, 720);
      CPUContext ctx;
      for (auto i = 0; i < t->size(); ++i) {
@ -1008,7 +1008,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
      t->Resize(3);
      CPUContext ctx;
      t->mutable_data<float>()[0] = 30;
@ -1072,7 +1072,8 @@ void testMPSCNN() {
                      LOG(INFO) << "MPSCNNConv Test";
                      Workspace ws;
                      {
-                        auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                        auto* t =
                            BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
                        t->Resize(batchSize, 12, 57, 72);
                        CPUContext ctx;
                        math::RandGaussian<float, CPUContext>(
@ -1080,7 +1081,7 @@ void testMPSCNN() {
                      }
                      {
-                        auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+                        auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
                        t->Resize(8, 12, kernel_h, kernel_w);
                        CPUContext ctx;
                        math::RandGaussian<float, CPUContext>(
@ -1092,7 +1093,7 @@ void testMPSCNN() {
                      }
                      {
-                        auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+                        auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
                        t->Resize(8);
                        CPUContext ctx;
                        math::RandGaussian<float, CPUContext>(
@ -1188,7 +1189,7 @@ void testMPSCNN() {
            Workspace ws;
            int output_channels = input_channels * channel_multiplier;
            {
-              auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
              t->Resize(batchSize, input_channels, 57, 72);
              CPUContext ctx;
              math::RandGaussian<float, CPUContext>(
@ -1196,7 +1197,7 @@ void testMPSCNN() {
            }
            {
-              auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
              t->Resize(output_channels, 1, 3, 3);
              CPUContext ctx;
              math::RandGaussian<float, CPUContext>(
@ -1204,7 +1205,7 @@ void testMPSCNN() {
            }
            {
-              auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
              t->Resize(output_channels);
              CPUContext ctx;
              math::RandGaussian<float, CPUContext>(
@ -1275,7 +1276,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNConvRelu Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1283,7 +1284,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
      t->Resize(8, 12, 3, 3);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1291,7 +1292,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
      t->Resize(8);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1385,7 +1386,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSConv Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1393,7 +1394,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
      t->Resize(8, 12, 3, 3);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1401,7 +1402,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
      t->Resize(8);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1493,7 +1494,7 @@ void testMPSCNN() {
              LOG(INFO) << "MPSConv Test";
              Workspace ws;
              {
-                auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
                t->Resize(batchSize, C, 12, 16);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -1501,7 +1502,7 @@ void testMPSCNN() {
              }
              {
-                auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
                t->Resize(M, C, K, K);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -1509,7 +1510,7 @@ void testMPSCNN() {
              }
              {
-                auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+                auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
                t->Resize(M);
                CPUContext ctx;
                math::RandGaussian<float, CPUContext>(
@ -1607,7 +1608,7 @@ void testMPSCNN() {
                LOG(INFO) << "MPSCNNConv Test - group";
                Workspace ws;
                {
-                  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                  auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
                  t->Resize(batchSize, C, 12, 16);
                  CPUContext ctx;
                  math::RandGaussian<float, CPUContext>(
@ -1615,7 +1616,7 @@ void testMPSCNN() {
                }
                {
-                  auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+                  auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
                  t->Resize(M, C / group, K, K);
                  CPUContext ctx;
                  math::RandGaussian<float, CPUContext>(
@ -1623,7 +1624,7 @@ void testMPSCNN() {
                }
                {
-                  auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+                  auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
                  t->Resize(M);
                  CPUContext ctx;
                  math::RandGaussian<float, CPUContext>(
@ -1726,7 +1727,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNMul Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1734,7 +1735,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
      t->Resize(72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1791,7 +1792,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNSub Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1799,7 +1800,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
      t->Resize(72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1856,7 +1857,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSAdd Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1864,7 +1865,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1921,7 +1922,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSAdd Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -1929,7 +1930,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -2011,7 +2012,7 @@ void testMPSCNN() {
      LOG(INFO) << "MPSCNNNeuron Test: " << n;
      Workspace ws;
      {
-        auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+        auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
        t->Resize(1, 4, 12, 12);
        CPUContext ctx;
        math::RandGaussian<float, CPUContext>(
@ -2065,7 +2066,7 @@ void testMPSCNN() {
    LOG(INFO) << "MPSCNNDropout Test";
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(1, 12, 57, 72);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(
@ -2136,7 +2137,7 @@ void testMPSCNN() {
                      << " - scale: " << scale;
            Workspace ws;
            {
-              auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
              t->Resize(1, channels, 40, 40);
              CPUContext ctx;
              math::RandGaussian<float, CPUContext>(
@ -2144,7 +2145,7 @@ void testMPSCNN() {
            }
            {
              // Use the batch-first encoding (n, [bbox])
-              auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
              t->Resize(6, 5);
              for (auto i = 0; i < t->dim32(0); ++i) {
                t->mutable_data<float>()[5 * i + 0] = 0; // batch
@ -2250,14 +2251,14 @@ void testMPSCNN() {
        LOG(INFO) << "MPSCNNRoIWarp Test 2";
        Workspace ws;
        {
-          auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
          t->Resize(1, 8, 40, 40);
          CPUContext ctx;
          math::RandGaussian<float, CPUContext>(
              t->size(), 4, 2, t->mutable_data<float>(), &ctx);
        }
        {
-          auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU);
+          auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU);
          t->Resize(6, 4);
          for (auto i = 0; i < t->dim32(0); ++i) {
            t->mutable_data<float>()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale;
@ -2362,7 +2363,7 @@ void testMPSCNN() {
            LOG(INFO) << "MPSCNNResizeNearestOp Test";
            Workspace ws;
            {
-              auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+              auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
              t->Resize(N, C, 37, 89);
              CPUContext ctx;
              math::RandGaussian<float, CPUContext>(
@ -2497,7 +2498,7 @@ void testMPSCNN() {
    vector<float> im_info{60, 80, 0.166667};
    vector<float> anchors{-38, -16, 53, 31, -120, -120, 135, 135};
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(num_images, A, H, W);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<float>()[i] = scores[i];
@ -2505,7 +2506,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU);
      t->Resize(num_images, 4 * A, H, W);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<float>()[i] = bbx[i];
@ -2513,7 +2514,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU);
      t->Resize(num_images, 3);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<float>()[i] = im_info[i];
@ -2521,7 +2522,7 @@ void testMPSCNN() {
    }
    {
-      auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU);
      t->Resize(A, 4);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<float>()[i] = anchors[i];
@ -2587,7 +2588,7 @@ void testMPSCNN() {
      LOG(INFO) << "MPSCNNSoftmax Test";
      Workspace ws;
      {
-        auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+        auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
        // Only works for spatial dimension of (1, 1) - weird.
        t->Resize(batchSize, 12, 1, 1);
        CPUContext ctx;
@ -2661,8 +2662,8 @@ void testMPSCNN() {
                            LOG(INFO) << "MPSConvTranspose Test";
                            Workspace ws;
                            {
-                              auto* t =
+                              auto* t = BlobGetMutableTensor(
-                                  ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+                                  ws.CreateBlob("X_cpu"), CPU);
                              t->Resize(batchSize, inputChannels, 8, 12);
                              CPUContext ctx;
                              math::RandGaussian<float, CPUContext>(
@ -2675,7 +2676,7 @@ void testMPSCNN() {
                            {
                              auto* t =
-                                  ws.CreateBlob("W")->GetMutableTensor(CPU);
+                                  BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
                              t->Resize(
                                  inputChannels,
                                  outputChannels,
@ -2692,7 +2693,7 @@ void testMPSCNN() {
                            {
                              auto* t =
-                                  ws.CreateBlob("b")->GetMutableTensor(CPU);
+                                  BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
                              t->Resize(outputChannels);
                              CPUContext ctx;
                              math::RandGaussian<float, CPUContext>(
@ -2809,7 +2810,7 @@ void testMPSCNN() {
                    << batchSize;
          Workspace ws;
          for (auto i = 0; i < numInputs; ++i) {
-            auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU);
+            auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
            t->Resize(batchSize, array ? (i + 1) * 4 : 4, 10, 10);
            CPUContext ctx;
            math::RandGaussian<float, CPUContext>(
@ -2891,7 +2892,7 @@ void testMPSCNN() {
          }
          Workspace ws;
          {
-            auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+            auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
            t->Resize(batchSize, inputChannels, 53, 47);
            CPUContext ctx;
            math::RandGaussian<float, CPUContext>(
@ -2964,7 +2965,7 @@ void testMPSCNN() {
                    << numInputs << ", " << batchSize;
          Workspace ws;
          for (auto i = 0; i < numInputs; ++i) {
-            auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU);
+            auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU);
            t->Resize(batchSize, channelCount, 9, 17);
            CPUContext ctx;
            math::RandGaussian<float, CPUContext>(
@ -3336,8 +3337,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) {
    Workspace cws;
    cws.RunNetOnce(initNet);
    {
-      auto* t =
+      auto* t = BlobGetMutableTensor(
-          cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU);
+          cws.CreateBlob(predictNet.external_input(0)), CPU);
      t->Resize(1, 224, 224, 4);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<uint8_t>()[i] = i % 225;
@ -3348,8 +3349,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) {
    Workspace mws;
    mws.RunNetOnce(initNet);
    {
-      auto* t =
+      auto* t = BlobGetMutableTensor(
-          mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU);
+          mws.CreateBlob(predictNet.external_input(0)), CPU);
      t->Resize(1, 224, 224, 4);
      for (auto i = 0; i < t->size(); ++i) {
        t->mutable_data<uint8_t>()[i] = i % 225;
@ -3397,16 +3398,16 @@ void verifyRewrite(
  dumpDef(predictNet);
  dumpDef(metalPredictNet);
-#define RUN_NET(ws, predictNet)                                             \
+#define RUN_NET(ws, predictNet)                            \
-  ws.RunNetOnce(initNet);                                                   \
+  ws.RunNetOnce(initNet);                                  \
-  {                                                                         \
+  {                                                        \
-    auto* t =                                                               \
+    auto* t = BlobGetMutableTensor(                        \
-        ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \
+        ws.CreateBlob(predictNet.external_input(0)), CPU); \
-    t->Resize(inputDims);                                                   \
+    t->Resize(inputDims);                                  \
-    CPUContext ctx;                                                         \
+    CPUContext ctx;                                        \
-    math::RandGaussian<float, CPUContext>(                                  \
+    math::RandGaussian<float, CPUContext>(                 \
-        t->size(), 0, 1, t->mutable_data<float>(), &ctx);                   \
+        t->size(), 0, 1, t->mutable_data<float>(), &ctx);  \
-  }                                                                         \
+  }                                                        \
  ws.RunNetOnce(predictNet);
  // initialize
--- a/caffe2/mobile/contrib/ios/pool_test.cc
+++ b/caffe2/mobile/contrib/ios/pool_test.cc
@ -16,7 +16,7 @@ void AddNoiseInput(const vector<int64_t>& shape, const string& name, Workspace*
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/mobile/contrib/ios/resize_test.cc
+++ b/caffe2/mobile/contrib/ios/resize_test.cc
@ -16,7 +16,7 @@ void AddNoiseInput(const vector<int64_t>& shape, const string& name, Workspace*
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/mobile/contrib/nnapi/nnapi.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi.cc
@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) {
        output_dims.push_back(dim);
      }
-      auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU);
+      auto* tensor = BlobGetMutableTensor(ws_.CreateBlob(blob), CPU);
      tensor->Resize(output_dims);
      outputs->push_back(tensor);
--- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc
@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
- 
+
 #include "caffe2/core/init.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
@ -43,14 +43,14 @@ static double benchmark_conv_caffe2(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
    if (group == 1) {
      t->Resize(K, C, kernel, kernel);
    } else {
@ -61,7 +61,7 @@ static double benchmark_conv_caffe2(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -129,14 +129,14 @@ static double benchmark_conv_nnapi(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
    if (group > 1) {
      CAFFE_ENFORCE_EQ(C, group);
      t->Resize(1, kernel, kernel, C);
@ -148,7 +148,7 @@ static double benchmark_conv_nnapi(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -190,7 +190,7 @@ static double benchmark_conv_nnapi(
  NetDef initNet;
  NNApi model(initNet, netdef, ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU));
  CAFFE_ENFORCE(model.run(inputs, &outputs));
  for (int i = 0; i < warmup; i++) {
@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8(
    ws = &localWs;
  }
  {
-    auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    for (int i = 0; i < t->size(); i++) {
      t->mutable_data<uint8_t>()[i] = rand() % 10;
    }
  }
  {
-    auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
    if (group > 1) {
      CAFFE_ENFORCE_EQ(C, group);
      t->Resize(1, kernel, kernel, C);
@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8(
  // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
  // bias_scale == input_scale * filter_scale.
  {
-    auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
    t->Resize(K);
    for (int i = 0; i < t->size(); i++) {
      t->mutable_data<int32_t>()[i] = rand() % 10;
@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8(
  NetDef initNet;
  NNApi model(initNet, netdef, ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU));
  CAFFE_ENFORCE(model.run(inputs, &outputs));
  for (int i = 0; i < warmup; i++) {
--- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc
+++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc
@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) {
  // CPU reference
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) {
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];
@ -103,21 +103,21 @@ static void test_conv_NHWC(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(K, kernel, kernel, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU);
    t->Resize(K);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -189,7 +189,7 @@ static void test_conv_NHWC(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];
@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(1, kernel, kernel, D);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU);
    t->Resize(D);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];
@ -428,7 +428,7 @@ static void test_pooling(
    int stride_w) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(
@ -496,7 +496,7 @@ static void test_pooling(
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];
@ -506,7 +506,7 @@ static void test_pooling(
 static void test_softmax(int N, int C, int H = 1, int W = 1) {
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    if (H == 1 && W == 1) {
      t->Resize(N, C);
    } else {
@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) {
  NetDef initNet;
  NNApi model(initNet, netdef, &ws);
  std::vector<TensorCPU*> inputs, outputs;
-  inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
+  inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU));
  EXPECT_TRUE(model.run(inputs, &outputs));
  const auto& t_nn = *outputs[0];
--- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1,
  LOG(INFO) << "OPENGLCopyFrom/To Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -275,7 +275,7 @@ void testOpenGLConv(int N,
            << " Op: " << glPoolOperationName[poolOp];
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -301,7 +301,7 @@ void testOpenGLConv(int N,
  }
  if (poolOp != AveragePool && poolOp != MaxPool) {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
      t->Resize(K, C, kernel_h, kernel_w);
    } else {
@ -343,7 +343,7 @@ void testOpenGLConv(int N,
    // bias
    {
-      auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
      t->Resize(K);
      CPUContext ctx;
      if (random_input) {
@ -367,7 +367,7 @@ void testOpenGLConv(int N,
  }
  if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -532,7 +532,7 @@ void testOpenGLPRelu(
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -541,7 +541,7 @@ void testOpenGLPRelu(
  // prelu scale
  {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
    t->Resize(prelu_size);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
+    auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
    t0->Resize(N, C, H, W);
    CPUContext ctx0;
    // Too noisy.
    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
+    auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
    t1->Resize(N, C, H, W);
    CPUContext ctx1;
    // Too noisy.
@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
  Workspace ws;
  {
-    auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
+    auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU);
    t0->Resize(N, C, H, W);
    CPUContext ctx0;
    // Too noisy.
    math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
-    auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
+    auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU);
    t1->Resize(N, C, H, W);
    CPUContext ctx1;
    // Too noisy.
@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
            << "H: " << H << ", W: " << W;
  Workspace ws;
  for (int i = 0; i < Cs.size(); i++) {
-    auto* t =
+    auto* t = BlobGetMutableTensor(
-        ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU);
+        ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU);
    t->Resize(N, Cs[i], H, W);
    CPUContext ctx0;
    // Too noisy.
@ -891,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -942,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
@ -992,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
  }
  {
-    auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU);
    t->Resize(1);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
@ -1060,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
  LOG(INFO) << "OpenGL Softmax Test "
            << "N: " << N << " D: " << D << " Tiled:" << tiled;
  Workspace ws;
-  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+  auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
  {
    t->Resize(N, D);
    CPUContext ctx;
@ -1151,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -1163,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
  // scale
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1172,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
  }
  // bias
  {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1254,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
            << "C: " << C << ", H: " << H << ", W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    // Too noisy.
@ -1266,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
  // scale
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1275,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
  }
  // bias
  {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
    t->Resize(C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1284,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
  }
  // prelu scale
  {
-    auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU);
    t->Resize(C);
    CPUContext ctx;
    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1385,7 +1385,7 @@ void OpenGL_speedtest(int N,
            << " C: " << C << " H: " << H << " W: " << W;
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -1399,7 +1399,7 @@ void OpenGL_speedtest(int N,
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(K, C, kernel_h, kernel_w);
    CPUContext ctx;
    if (random_input) {
@ -1413,7 +1413,7 @@ void OpenGL_speedtest(int N,
  }
  {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -1479,7 +1479,7 @@ void testOpenGLPadImage(
  {
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(N, C, H, W);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1593,7 +1593,7 @@ void testOpenGLResize(int N,
  {
    Workspace ws;
    {
-      auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
      t->Resize(N, C, H, W);
      CPUContext ctx;
      math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1675,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGL Preprocess Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, H, W, C);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1684,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
  }
  {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
    t->Resize(3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 100;
@ -1748,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGLDeprocess Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1757,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
  }
  {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
    t->Resize(3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 30;
@ -1800,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  LOG(INFO) << "OpenGLNormPlanarYUV Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, 3, H, W);
    CPUContext ctx;
    for (auto i = 0; i < t->size(); ++i) {
@ -1809,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  }
  {
-    auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU);
    t->Resize(1, 3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 30;
@ -1818,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
  }
  {
-    auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU);
    t->Resize(1, 3);
    CPUContext ctx;
    t->mutable_data<float>()[0] = 6;
@ -1879,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N,
  LOG(INFO) << "OpenGL CopyOps Speed Test";
  Workspace ws;
  {
-    auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU);
    t->Resize(N, C, H, W);
    CPUContext ctx;
    if (random_input) {
@ -1893,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N,
  }
  {
-    auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    t->Resize(K, C, kernel_h, kernel_w);
    CPUContext ctx;
    if (random_input) {
@ -1907,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N,
  }
  {
-    auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
    t->Resize(K);
    CPUContext ctx;
    if (random_input) {
@ -1990,8 +1990,8 @@ void compareModelsForOpenGL(std::string name,
    Workspace cws;
    cws.RunNetOnce(initNet);
-    auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))
+    auto* t_cpu = BlobGetMutableTensor(
-                      ->GetMutableTensor(CPU);
+        cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2032,8 +2032,8 @@ void compareModelsForOpenGL(std::string name,
    Workspace mws;
    mws.RunNetOnce(initNet);
-    auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))
+    auto* t_gl = BlobGetMutableTensor(
-                     ->GetMutableTensor(CPU);
+        mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2116,7 +2116,7 @@ void compareBatchedToTiledModels(std::string name,
    tws.RunNetOnce(initNet);
    auto* t_batch =
-        tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU);
+        BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2143,7 +2143,7 @@ void compareBatchedToTiledModels(std::string name,
    bws.RunNetOnce(initNet);
    auto* t_tiling =
-        bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU);
+        BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU);
    if (name == "styleTransfer") {
      CAFFE_ENFORCE_EQ(input_order, "NHWC");
      CAFFE_ENFORCE_EQ(input_type, "uint8_t");
--- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
+++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc
@ -14,7 +14,7 @@
 #define POPULATE_DATA(_n, _s, _l)                                         \
  do {                                                                    \
    Blob* _blob = ws.CreateBlob((_n));                                    \
-    auto* _tensor = _blob->GetMutableTensor(CPU);                         \
+    auto* _tensor = BlobGetMutableTensor(_blob, CPU);                     \
    _tensor->Resize((_s));                                                \
    memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes()); \
  } while (0)
@ -23,7 +23,7 @@
 #define POPULATE_DATA(_n, _s, _l)                                 \
  do {                                                            \
    Blob* _blob = ws.CreateBlob((_n));                            \
-    auto* _tensor = _blob->GetMutableTensor(CPU);                 \
+    auto* _tensor = BlobGetMutableTensor(_blob, CPU);             \
    _tensor->Resize((_s));                                        \
    memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes()); \
  } while (0)
@ -43,7 +43,7 @@ void AddConstInput(const vector<int64_t>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::Set<float, CPUContext>(tensor->size(), value,
                               tensor->mutable_data<float>(),
@ -56,7 +56,7 @@ void AddNoiseInput(const vector<int64_t>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/mobile/contrib/ulp2/ulp_test.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc
@ -289,13 +289,13 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
    def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
    def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
    def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
-    auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU);
+    auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU);
    Xws->ResizeLike(X);
    Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
-    auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU);
+    auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
    Wws->ResizeLike(W_);
    Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
-    auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU);
+    auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
    bws->ResizeLike(bias);
    bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
    ws.RunOperatorOnce(def);
--- a/caffe2/operators/batch_matmul_op_gpu_test.cc
+++ b/caffe2/operators/batch_matmul_op_gpu_test.cc
@ -30,7 +30,7 @@ class BatchMatMulOpGPUTest : public testing::Test {
      const float value,
      const string& name) {
    Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutableTensor(CUDA);
+    auto* tensor = BlobGetMutableTensor(blob, CUDA);
    tensor->Resize(dims);
    math::Set<float, CUDAContext>(
        tensor->size(),
--- a/caffe2/operators/batch_matmul_op_test.cc
+++ b/caffe2/operators/batch_matmul_op_test.cc
@ -24,7 +24,7 @@ class BatchMatMulOpTest : public testing::Test {
      const float value,
      const string& name) {
    Blob* blob = ws_.CreateBlob(name);
-    auto* tensor = blob->GetMutableTensor(CPU);
+    auto* tensor = BlobGetMutableTensor(blob, CPU);
    tensor->Resize(dims);
    math::Set<float, CPUContext>(
        tensor->size(),
--- a/caffe2/operators/boolean_unmask_ops_test.cc
+++ b/caffe2/operators/boolean_unmask_ops_test.cc
@ -16,7 +16,7 @@ static void AddScalarInput(
    Workspace* ws,
    bool isEmpty = false) {
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  if (!isEmpty) {
    tensor->Resize(vector<int64_t>{1});
    *(tensor->template mutable_data<DataT>()) = value;
--- a/caffe2/operators/conv_op_shared.cc
+++ b/caffe2/operators/conv_op_shared.cc
@ -27,8 +27,8 @@ void runWithSharedBuffer<CPUContext>(
  auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
  std::lock_guard<std::mutex> g(**mutexPtr);
-  auto* buffer =
+  auto* buffer = BlobGetMutableTensor(
-      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU);
+      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__"), CPU);
  f(buffer);
 }
 }
--- a/caffe2/operators/conv_op_shared_gpu.cc
+++ b/caffe2/operators/conv_op_shared_gpu.cc
@ -20,8 +20,8 @@ void runWithSharedBuffer<CUDAContext>(
  auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
  std::lock_guard<std::mutex> g(**mutexPtr);
-  auto* buffer =
+  auto* buffer = BlobGetMutableTensor(
-      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA);
+      ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__"), CUDA);
  f(buffer);
 }
 }
--- a/caffe2/operators/conv_transpose_op_mobile_test.cc
+++ b/caffe2/operators/conv_transpose_op_mobile_test.cc
@ -17,7 +17,7 @@ void AddConstInput(const vector<int64_t>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::Set<float, CPUContext>(
      tensor->size(), value, tensor->template mutable_data<float>(), &context);
@ -29,7 +29,7 @@ void AddNoiseInput(const vector<int64_t>& shape,
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@ -1428,7 +1428,7 @@ class TreeCursorSerializer : public BlobSerializerBase {
    // serialize offsets as a tensor
    if (cursor->offsets.size() > 0) {
      Blob offsets_blob;
-      auto* offsets = offsets_blob.GetMutableTensor(CPU);
+      auto* offsets = BlobGetMutableTensor(&offsets_blob, CPU);
      offsets->Resize(cursor->offsets.size());
      std::copy(
          cursor->offsets.begin(),
--- a/caffe2/operators/dropout_op_cudnn.cc
+++ b/caffe2/operators/dropout_op_cudnn.cc
@ -150,7 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() {
    // Reshape tensor descriptors if necessary
    if (X.dims() != cudnn_input_dims_ && !is_test_) {
      CAFFE_ENFORCE(scratch_blob_);
-      Tensor* states = scratch_blob_->GetMutableTensor(CUDA);
+      Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA);
      cudnn_input_dims_ = X.dims();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
--- a/caffe2/operators/elementwise_op_test.h
+++ b/caffe2/operators/elementwise_op_test.h
@ -19,7 +19,7 @@ void FillTensor(
    const std::vector<int64_t>& shape,
    const std::vector<I_Type>& values) {
  auto* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
+  auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType());
  tensor->Resize(shape);
  auto* mutable_data = tensor->template mutable_data<O_Type>();
  const O_Type* data = reinterpret_cast<const O_Type*>(values.data());
--- a/caffe2/operators/generate_proposals_op_test.cc
+++ b/caffe2/operators/generate_proposals_op_test.cc
@ -18,7 +18,7 @@ static void AddConstInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::Set<float, CPUContext>(
      tensor->size(), value, tensor->template mutable_data<float>(), &context);
@ -34,7 +34,7 @@ static void AddLinSpacedInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  EigenVectorMap<float> tensor_vec(
      tensor->template mutable_data<float>(), tensor->size());
@ -51,7 +51,7 @@ static void AddInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  EigenVectorMap<float> tensor_vec(
      tensor->template mutable_data<float>(), tensor->size());
--- a/caffe2/operators/index_ops.cc
+++ b/caffe2/operators/index_ops.cc
@ -353,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase {
      SerializationAcceptor acceptor) override {
    auto& base = blob.template Get<std::unique_ptr<IndexBase>>();
    Blob tensor_blob;
-    auto* tensor_out = tensor_blob.GetMutableTensor(CPU);
+    auto* tensor_out = BlobGetMutableTensor(&tensor_blob, CPU);
    if (base->Type().Match<std::string>()) {
      doStore<std::string>(base, tensor_out);
--- a/caffe2/operators/onnx_while_op.h
+++ b/caffe2/operators/onnx_while_op.h
@ -213,23 +213,23 @@ class ONNXWhileOp final : public Operator<Context> {
      lcd_tensors_.clear();
      for (int i = 2; i < body_net_def.external_input_size(); ++i) {
        Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i));
-        Tensor* t = b->GetMutableTensor(Context::GetDeviceType());
+        Tensor* t = BlobGetMutableTensor(b, Context::GetDeviceType());
        lcd_tensors_.push_back(t);
      }
      // First output is the iteration variable
      auto* iteration_var_blob = loop_ws_->CreateBlob(
          body_net_def.external_input(0));
      iteration_var_ =
-          iteration_var_blob->GetMutableTensor(Context::GetDeviceType());
+          BlobGetMutableTensor(iteration_var_blob, Context::GetDeviceType());
-      input_condition_var_ =
+      input_condition_var_ = BlobGetMutableTensor(
-          loop_ws_->CreateBlob(body_net_def.external_input(1))
+          loop_ws_->CreateBlob(body_net_def.external_input(1)),
-              ->GetMutableTensor(Context::GetDeviceType());
+          Context::GetDeviceType());
      auto* condition_var_blob =
          loop_ws_->CreateBlob(body_net_def.external_output(0));
      condition_var_ =
-          condition_var_blob->GetMutableTensor(Context::GetDeviceType());
+          BlobGetMutableTensor(condition_var_blob, Context::GetDeviceType());
      condition_var_->Resize(1);
      condition_var_->template mutable_data<bool>();
--- a/caffe2/operators/onnxifi_op.cc
+++ b/caffe2/operators/onnxifi_op.cc
@ -15,7 +15,7 @@ void BlobToTensorDescriptor(
  // Memory type
  // We only allow weights to be CPU tensor for now
  CAFFE_ENFORCE(
-      blob->IsTensorType(CPU),
+      BlobIsTensorType(*blob, CPU),
      "Initialization blob ",
      name,
      " needs to be TensorCPU");
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@ -65,8 +65,8 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
    bool need_sync = false;
    for (int i = 0; i < InputSize(); ++i) {
      if (this->InputIsTensorType(i, CUDA)) {
-        local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom(
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            Input(i), &context_);
+            ->CopyFrom(Input(i), &context_);
        need_sync = true;
      } else {
        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
        continue;
      }
      CAFFE_ENFORCE(
-          local_output_blobs_[i]->IsTensorType(CPU),
+          BlobIsTensorType(*local_output_blobs_[i], CPU),
          "GPU fallback op currently does not support non-TensorCPU "
          "output type who needs copying.");
      Output(i)->CopyFrom(local_output_blobs_[i]->template Get<TensorCPU>());
--- a/caffe2/operators/operator_fallback_gpu_test.cc
+++ b/caffe2/operators/operator_fallback_gpu_test.cc
@ -40,7 +40,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) {
  for (int i = 0; i < 6; ++i) {
    source_tensor.mutable_data<float>()[i] = i;
  }
-  ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor);
+  BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(source_tensor);
  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
  EXPECT_TRUE(op.get() != nullptr);
  EXPECT_TRUE(op->Run());
@ -64,7 +64,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
  for (int i = 0; i < 6; ++i) {
    source_tensor.mutable_data<float>()[i] = i;
  }
-  ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor);
+  BlobGetMutableTensor(ws.CreateBlob("X"), CUDA)->CopyFrom(source_tensor);
  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
  EXPECT_TRUE(op.get() != nullptr);
  EXPECT_TRUE(op->Run());
--- a/caffe2/operators/reshape_op_gpu_test.cc
+++ b/caffe2/operators/reshape_op_gpu_test.cc
@ -20,7 +20,7 @@ static void AddConstInput(
  option.set_device_type(PROTO_CUDA);
  CUDAContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = BlobGetMutableTensor(blob, CUDA);
  tensor->Resize(shape);
  math::Set<float, CUDAContext>(
      tensor->size(), value, tensor->template mutable_data<float>(), &context);
--- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
+++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
@ -43,11 +43,10 @@ class RecurrentNetworkBlobFetcherOp final : public Operator<Context> {
            prefix_ + std::string("_") + blob_name + caffe2::to_string(i);
        blob_names_vector.push_back(newBlobName);
-        ws_->CreateBlob(newBlobName)
+        BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU)
-            ->GetMutableTensor(CPU)
            ->ResizeLike(currentTensor);
        auto type = Context::GetDeviceType();
-        auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type);
+        auto* newTensor = BlobGetMutableTensor(ws_->GetBlob(newBlobName), type);
        newTensor->CopyFrom(currentTensor);
      }
    }
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase {
      // the forward-only mode.
      std::string this_timestep_blob =
          timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t);
-      ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1);
+      BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1);
      auto b = ws->GetBlob(this_timestep_blob);
      CAFFE_ENFORCE(b);
-      b->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
+      BlobGetMutableTensor(b, CPU)->template mutable_data<int32_t>()[0] = t;
      // Copy the operators from template
      for (auto& template_rnn_op : timestep_ops_template_) {
--- a/caffe2/operators/rnn/recurrent_network_op.h
+++ b/caffe2/operators/rnn/recurrent_network_op.h
@ -52,10 +52,11 @@ struct CAFFE2_API ScratchWorkspaces {
 };
 inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) {
-  ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1);
+  BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1);
  auto timestepBlob = ws->GetBlob(blob_name);
  CAFFE_ENFORCE(timestepBlob);
-  timestepBlob->GetMutableTensor(CPU)->template mutable_data<int32_t>()[0] = t;
+  BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data<int32_t>()[0] =
+      t;
 }
 CAFFE2_API std::map<string, string> GetRecurrentMapping(
@ -71,8 +72,9 @@ void applyOffsetAlias(
          << " at offset: " << oc.offset;
  auto srcBlob = ws->GetBlob(oc.src);
  CAFFE_ENFORCE(srcBlob);
-  auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType());
+  auto* src = BlobGetMutableTensor(srcBlob, Context::GetDeviceType());
-  auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType());
+  auto* dst =
+      BlobGetMutableTensor(ws->GetBlob(oc.dst), Context::GetDeviceType());
  auto timestep = src->size() / src->dim(0);
  auto dims = src->dims();
  const int32_t startDstTimestep =
@ -113,7 +115,7 @@ void initializeRecurrentInput(
    Context* context) {
  auto stateBlob = ws->GetBlob(rc.state);
  CAFFE_ENFORCE(stateBlob);
-  auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType());
+  auto* state = BlobGetMutableTensor(stateBlob, Context::GetDeviceType());
  auto inputBlob = ws->GetBlob(rc.input);
  CAFFE_ENFORCE(inputBlob);
@ -660,7 +662,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
      auto gBlob = sharedWs_->GetBlob(param.grad);
      CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType());
      g->ResizeLike(p);
      math::Set<T, Context>(
          g->size(),
@ -676,7 +678,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
      auto gBlob = sharedWs_->CreateBlob(rg.grad);
      CAFFE_ENFORCE(gBlob);
-      auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType());
      g->ResizeLike(p);
      CAFFE_ENFORCE_EQ(g->ndim(), 3);
      const auto timestep = g->size() / g->dim(0);
@ -703,7 +705,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
              << ". Size: " << Input(gradientInputIndex).size();
      auto pGradientBlob = sharedWs_->GetBlob(gradientName);
      CAFFE_ENFORCE(pGradientBlob);
-      auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* g = BlobGetMutableTensor(pGradientBlob, Context::GetDeviceType());
      g->ResizeLike(Input(gradientInputIndex));
      g->template mutable_data<T>();
    }
@ -717,7 +719,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
                << rg.lastExternalGrad << " for final time step (sep. blob)";
        auto gBlob = sharedWs_->GetBlob(rg.grad);
        CAFFE_ENFORCE(gBlob);
-        auto* g = gBlob->GetMutableTensor(Context::GetDeviceType());
+        auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType());
        auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad);
        CAFFE_ENFORCE(oglastBlob);
@ -779,7 +781,7 @@ class RecurrentNetworkGradientOp final : public Operator<Context> {
      T* output_data = Output(outputIdx)->template mutable_data<T>();
      auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad);
      CAFFE_ENFORCE(pBlob);
-      auto* p = pBlob->GetMutableTensor(Context::GetDeviceType());
+      auto* p = BlobGetMutableTensor(pBlob, Context::GetDeviceType());
      if (Input(inputId).ndim() >= 2) {
        // Gradient states blob should live. And if it gets changed by the
--- a/caffe2/operators/roi_align_op_gpu_test.cc
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@ -18,7 +18,7 @@ void AddConstInput(
    Context* context,
    Workspace* ws) {
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(Context::GetDeviceType());
+  auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType());
  tensor->Resize(shape);
  math::Set<float, Context>(
      tensor->size(), value, tensor->template mutable_data<float>(), context);
@ -39,7 +39,7 @@ void AddInput<CPUContext>(
    const string& name,
    Workspace* ws) {
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  EigenVectorMap<float> tensor_vec(
      tensor->template mutable_data<float>(), tensor->size());
@ -57,7 +57,7 @@ void AddInput<CUDAContext>(
  tmp_vec.array() = utils::AsEArrXt(values);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = BlobGetMutableTensor(blob, CUDA);
  tensor->CopyFrom(tmp);
 }
--- a/caffe2/operators/string_ops_test.cc
+++ b/caffe2/operators/string_ops_test.cc
@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test {
 public:
  bool runOp(const TensorCPU& input) {
    auto* blob = ws_.CreateBlob("X");
-    auto* tensor = blob->GetMutableTensor(CPU);
+    auto* tensor = BlobGetMutableTensor(blob, CPU);
    tensor->ResizeLike(input);
    tensor->ShareData(input);
@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test {
  const std::string* checkAndGetOutput(int outputSize) {
    const auto* output = ws_.GetBlob("Y");
    EXPECT_NE(output, nullptr);
-    EXPECT_TRUE(output->IsTensorType(CPU));
+    EXPECT_TRUE(BlobIsTensorType(*output, CPU));
    const auto& outputTensor = output->Get<TensorCPU>();
    EXPECT_EQ(outputTensor.ndim(), 1);
    EXPECT_EQ(outputTensor.dim(0), outputSize);
@ -42,7 +42,7 @@ TEST_F(StringJoinOpTest, testString1DJoin) {
  std::vector<std::string> input = {"a", "xx", "c"};
  auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob.get(), CPU);
  tensor->Resize(input.size());
  auto* data = tensor->template mutable_data<std::string>();
  for (int i = 0; i < input.size(); ++i) {
@ -62,7 +62,7 @@ TEST_F(StringJoinOpTest, testString2DJoin) {
                                                 {"dd", "ee", "ff"}};
  auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob.get(), CPU);
  tensor->Resize(input.size(), input[0].size());
  auto* data = tensor->template mutable_data<std::string>();
  for (int i = 0; i < input.size(); ++i) {
@ -82,7 +82,7 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) {
  std::vector<float> input = {3.90f, 5.234f, 8.12f};
  auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob.get(), CPU);
  tensor->Resize(input.size());
  auto* data = tensor->template mutable_data<float>();
  for (int i = 0; i < input.size(); ++i) {
@ -102,7 +102,7 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) {
                                           {4.67f, 5.90f, 6.32f}};
  auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob.get(), CPU);
  tensor->Resize(input.size(), input[0].size());
  auto* data = tensor->template mutable_data<float>();
  for (int i = 0; i < input.size(); ++i) {
@ -122,7 +122,7 @@ TEST_F(StringJoinOpTest, testLong2DJoin) {
  std::vector<std::vector<int64_t>> input = {{100, 200}, {1000, 2000}};
  auto blob = caffe2::make_unique<Blob>();
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob.get(), CPU);
  tensor->Resize(input.size(), input[0].size());
  auto* data = tensor->template mutable_data<int64_t>();
  for (int i = 0; i < input.size(); ++i) {
--- a/caffe2/operators/stylizer_ops.cc
+++ b/caffe2/operators/stylizer_ops.cc
@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
    auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
        "noise_size", 491 /* prime to avoid artifacts */);
-    if (!noiseBlob->IsTensorType(CPU)) {
+    if (!BlobIsTensorType(*noiseBlob, CPU)) {
      // Initialize random noise on first use.
      // Cache it to maintain temporal consistency.
-      auto* t = noiseBlob->GetMutableTensor(CPU);
+      auto* t = BlobGetMutableTensor(noiseBlob, CPU);
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Noise space is larger for vectorized code due to the
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@ -56,7 +56,7 @@ bool TensorProtosDBInput<Context>::Prefetch() {
        protos.mutable_protos(i)->clear_device_detail();
      }
      deserializer.Deserialize(
-          protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU));
+          protos.protos(i), BlobGetMutableTensor(&prefetched_blobs_[i], CPU));
    }
  } else {
    vector<Tensor> temp_tensors;
@ -74,11 +74,11 @@ bool TensorProtosDBInput<Context>::Prefetch() {
          vector<int> dims(
              protos.protos(i).dims().begin(), protos.protos(i).dims().end());
          dims.insert(dims.begin(), batch_size_);
-          prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims);
+          BlobGetMutableTensor(&prefetched_blobs_[i], CPU)->Resize(dims);
        }
      }
      for (int i = 0; i < protos.protos_size(); ++i) {
-        TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU);
+        TensorCPU* dst = BlobGetMutableTensor(&prefetched_blobs_[i], CPU);
        TensorCPU& src = temp_tensors[i];
        if (protos.protos(i).has_device_detail()) {
          protos.mutable_protos(i)->clear_device_detail();
--- a/caffe2/operators/tt_linear_op.h
+++ b/caffe2/operators/tt_linear_op.h
@ -52,7 +52,7 @@ class TTLinearOp final : public Operator<Context> {
    int cores_idx = 0;
    // Temporary buffer to facilitate multiplication of TT-cores with input
-    auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType());
+    auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType());
    Y_buf->ResizeLike(X);
    Y_buf->CopyFrom(X);
--- a/caffe2/operators/utility_ops_gpu_test.cc
+++ b/caffe2/operators/utility_ops_gpu_test.cc
@ -19,7 +19,7 @@ static void AddConstInput(
  option.set_device_type(PROTO_CUDA);
  CUDAContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CUDA);
+  auto* tensor = BlobGetMutableTensor(blob, CUDA);
  tensor->Resize(shape);
  math::Set<float, CUDAContext>(
      tensor->size(), value, tensor->template mutable_data<float>(), &context);
--- a/caffe2/operators/utility_ops_test.cc
+++ b/caffe2/operators/utility_ops_test.cc
@ -16,7 +16,7 @@ static void AddConstInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::Set<float, CPUContext>(
      tensor->size(), value, tensor->template mutable_data<float>(), &context);
--- a/caffe2/opt/fusion.cc
+++ b/caffe2/opt/fusion.cc
@ -44,10 +44,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
    CAFFE_ENFORCE(
        bnInputs.size() >= 5, "Invalid batch normalization input size");
-#define EXPOSE_TENSOR_DATA(name, index, inputs)                            \
+#define EXPOSE_TENSOR_DATA(name, index, inputs)                                \
-  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                  \
+  auto name = repr::nn::get<repr::Tensor>(inputs[index]);                      \
-  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");         \
+  assert(ws->HasBlob(name->getName()) && "Blob not in workspace");             \
-  auto name##Tensor = ws->GetBlob(name->getName())->GetMutableTensor(CPU); \
+  auto name##Tensor = BlobGetMutableTensor(ws->GetBlob(name->getName()), CPU); \
  auto name##Data = name##Tensor->mutable_data<float>();
    EXPOSE_TENSOR_DATA(filter, 1, convInputs);
@ -76,7 +76,7 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) {
          nn->dataFlow.createEdge(convBiasNode, convNode);
          auto* blob = ws->CreateBlob(convBiasName);
-          caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
+          caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
          CHECK_NOTNULL(tensor);
          // Get output channel
          size_t c = filterTensor->dim32(0);
--- a/caffe2/opt/onnxifi_transformer.cc
+++ b/caffe2/opt/onnxifi_transformer.cc
@ -173,7 +173,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp(
      // Feed into workspace as CPU Tensors
      auto* blob = ws->CreateBlob(t.name());
-      auto* cpu_tensor = blob->GetMutableTensor(CPU);
+      auto* cpu_tensor = BlobGetMutableTensor(blob, CPU);
      std::vector<int64_t> dims;
      for(const auto& d : t.dims()) {
        dims.push_back(d);
--- a/caffe2/predictor/predictor.cc
+++ b/caffe2/predictor/predictor.cc
@ -10,14 +10,14 @@ void enforceIsTensor(Workspace* ws, const std::string& name) {
  auto blob = ws->GetBlob(name);
  CAFFE_ENFORCE(blob, "Blob does not exist: ", name);
  CAFFE_ENFORCE(
-      blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name);
+      BlobIsTensorType(*blob, CPU), "Blob is not a CPU Tensor: ", name);
 }
 TensorCPU* getTensor(Workspace* ws, const std::string& name) {
  enforceIsTensor(ws, name);
  auto* blob = ws->GetBlob(name);
  CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist");
-  return blob->GetMutableTensor(CPU);
+  return BlobGetMutableTensor(blob, CPU);
 }
 void shareInputTensor(
@ -60,7 +60,7 @@ Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) {
  for (const auto& name : config_.predict_net->external_input()) {
    if (!initialized.count(name)) {
      auto* blob = config_.ws->CreateBlob(name);
-      blob->GetMutableTensor(CPU);
+      BlobGetMutableTensor(blob, CPU);
    }
  }
  CAFFE_ENFORCE(config_.ws->CreateNet(config_.predict_net));
--- a/caffe2/predictor/predictor_test.cc
+++ b/caffe2/predictor/predictor_test.cc
@ -135,7 +135,7 @@ std::unique_ptr<Blob> randomTensor(
    const std::vector<int64_t>& dims,
    CPUContext* ctx) {
  auto blob = make_unique<Blob>();
-  auto* t = blob->GetMutableTensor(CPU);
+  auto* t = BlobGetMutableTensor(blob.get(), CPU);
  t->Resize(dims);
  math::RandUniform<float, CPUContext>(
      t->size(), -1.0, 1.0, t->template mutable_data<float>(), ctx);
@ -180,7 +180,7 @@ TEST_F(PredictorTest, SimpleBatchSized) {
  auto inputData = randomTensor({1, 4}, ctx_.get());
  Predictor::TensorList input;
  input.emplace_back(CPU);
-  auto tensor = inputData->GetMutableTensor(CPU);
+  auto tensor = BlobGetMutableTensor(inputData.get(), CPU);
  input.back().ResizeLike(*tensor);
  input.back().ShareData(*tensor);
  Predictor::TensorList output;
@ -196,7 +196,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
  auto inputData = randomTensor({1, 4}, ctx_.get());
  Predictor::TensorMap input;
  auto iter = input.emplace("data", Tensor(CPU));
-  auto tensor = inputData->GetMutableTensor(CPU);
+  auto tensor = BlobGetMutableTensor(inputData.get(), CPU);
  iter.first->second.ResizeLike(*tensor);
  iter.first->second.ShareData(*tensor);
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@ -328,7 +328,7 @@ void addObjectMethods(py::module& m) {
          })
      .def(
          "tensor",
-          [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); },
+          [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); },
          py::return_value_policy::reference_internal)
      .def(
          "_feed",
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@ -234,7 +234,7 @@ class TensorFeeder : public BlobFeederBase {
    FeedTensor(
        option,
        original_array,
-        blob->GetMutableTensor(Context::GetDeviceType()));
+        BlobGetMutableTensor(blob, Context::GetDeviceType()));
  }
 };
@ -366,31 +366,32 @@ class PythonOpBase : public Operator<Context> {
        // make sure output blob is initialized before creating the binding
        if (forced_cpu_outputs_.count(i)) {
-          blob->GetMutableTensor(Context::GetDeviceType());
+          BlobGetMutableTensor(blob, Context::GetDeviceType());
        } else {
-          blob->GetMutableTensor(Context::GetDeviceType());
+          BlobGetMutableTensor(blob, Context::GetDeviceType());
        }
        py::object py_obj;
        if (blob->template IsType<Tensor>()) {
          if (use_dlpack) {
            DLPackWrapper<CPUContext> wrapper(
-                blob->GetMutableTensor(Context::GetDeviceType()), cpu_option);
+                BlobGetMutableTensor(blob, Context::GetDeviceType()),
                cpu_option);
            py_obj = py::cast(wrapper, py::return_value_policy::copy);
          } else {
            py_obj = py::cast(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                BlobGetMutableTensor(blob, Context::GetDeviceType()),
                py::return_value_policy::reference);
          }
        } else {
          if (use_dlpack) {
            DLPackWrapper<Context> wrapper(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                BlobGetMutableTensor(blob, Context::GetDeviceType()),
                this->device_option());
            py_obj = py::cast(wrapper, py::return_value_policy::copy);
          } else {
            py_obj = py::cast(
-                blob->GetMutableTensor(Context::GetDeviceType()),
+                BlobGetMutableTensor(blob, Context::GetDeviceType()),
                py::return_value_policy::reference);
          }
        }
--- a/caffe2/python/pybind_state_ideep.cc
+++ b/caffe2/python/pybind_state_ideep.cc
@ -163,8 +163,8 @@ public:
        DeviceOption cpu_option(option);
        cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU);
        TensorFeeder<CPUContext> cpu_tensor_feeder;
-        cpu_tensor_feeder.FeedTensor(cpu_option, original_array,
+        cpu_tensor_feeder.FeedTensor(
-                                     blob->GetMutableTensor(CPU));
+            cpu_option, original_array, BlobGetMutableTensor(blob, CPU));
      }
    } catch (ideep::error &e) {
      LOG(ERROR) << "IDEEP error: " << e.message;
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc
@ -19,7 +19,7 @@ void AddNoiseInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/share/contrib/nnpack/conv_op.cc
+++ b/caffe2/share/contrib/nnpack/conv_op.cc
@ -231,11 +231,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
            (transformedFilterSize + sizeof(float) - 1) / sizeof(float);
        for (auto g = 0; g < group_; g++) {
-          transformedFilters_[g] = ws_->CreateBlob(
+          transformedFilters_[g] = BlobGetMutableTensor(
-                                          "__transformed_kernel_" +
+              ws_->CreateBlob(
-                                          to_string(__sync_fetch_and_add(
+                  "__transformed_kernel_" +
-                                              &precomputed_transform_id, 1)))
+                  to_string(
-                                       ->GetMutableTensor(CPU);
+                      __sync_fetch_and_add(&precomputed_transform_id, 1))),
              CPU);
          transformedFilters_[g]->Resize(transformedFilterElements);
          status = nnp_convolution_inference(
--- a/caffe2/share/contrib/nnpack/nnpack_test.cc
+++ b/caffe2/share/contrib/nnpack/nnpack_test.cc
@ -19,7 +19,7 @@ void AddNoiseInput(
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutableTensor(CPU);
+  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
--- a/caffe2/utils/hip/math_blas_hip_test.cc
+++ b/caffe2/utils/hip/math_blas_hip_test.cc
@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
  vector<int> shapeX{5, 10};
  vector<int> shapeW{10, 6};
  vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = BlobGetMutableTensor(blobX, HIP);
  tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutableTensor(HIP);
+  auto* tensorW = BlobGetMutableTensor(blobW, HIP);
  tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = BlobGetMutableTensor(blobY, HIP);
  tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU);
  tensorY_host->Resize(shapeY);
  EXPECT_EQ(tensorX->size(), 50);
@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
  vector<int> shapeX{5, 10};
  vector<int> shapeW{6, 10};
  vector<int> shapeY{5, 6};
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = BlobGetMutableTensor(blobX, HIP);
  tensorX->Resize(shapeX);
-  auto* tensorW = blobW->GetMutableTensor(HIP);
+  auto* tensorW = BlobGetMutableTensor(blobW, HIP);
  tensorW->Resize(shapeW);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = BlobGetMutableTensor(blobY, HIP);
  tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU);
  tensorY_host->Resize(shapeY);
  EXPECT_EQ(tensorX->size(), 50);
@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) {
  vector<int> shapeA{5, 10};
  vector<int> shapeX{10};
  vector<int> shapeY{5};
-  auto* tensorA = blobA->GetMutableTensor(HIP);
+  auto* tensorA = BlobGetMutableTensor(blobA, HIP);
  tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = BlobGetMutableTensor(blobX, HIP);
  tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = BlobGetMutableTensor(blobY, HIP);
  tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU);
  tensorY_host->Resize(shapeY);
  EXPECT_EQ(tensorA->size(), 50);
@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) {
  vector<int> shapeA{6, 10};
  vector<int> shapeX{6};
  vector<int> shapeY{10};
-  auto* tensorA = blobA->GetMutableTensor(HIP);
+  auto* tensorA = BlobGetMutableTensor(blobA, HIP);
  tensorA->Resize(shapeA);
-  auto* tensorX = blobX->GetMutableTensor(HIP);
+  auto* tensorX = BlobGetMutableTensor(blobX, HIP);
  tensorX->Resize(shapeX);
-  auto* tensorY = blobY->GetMutableTensor(HIP);
+  auto* tensorY = BlobGetMutableTensor(blobY, HIP);
  tensorY->Resize(shapeY);
-  auto* tensorY_host = blobY_host->GetMutableTensor(CPU);
+  auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU);
  tensorY_host->Resize(shapeY);
  EXPECT_EQ(tensorA->size(), 60);
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@ -41,9 +41,9 @@ void executeGpuBinaryOpTest(
  Blob* bloby = ws.CreateBlob("Y");
  Blob* bloby_host = ws.CreateBlob("Y_host");
-  auto* tensorx0 = blobx0->GetMutableTensor(CUDA);
+  auto* tensorx0 = BlobGetMutableTensor(blobx0, CUDA);
-  auto* tensorx1 = blobx1->GetMutableTensor(CUDA);
+  auto* tensorx1 = BlobGetMutableTensor(blobx1, CUDA);
-  auto* tensory = bloby->GetMutableTensor(CUDA);
+  auto* tensory = BlobGetMutableTensor(bloby, CUDA);
  vector<int> shapex0_vector{shapex0};
  vector<int> shapex1_vector{shapex1};
@ -71,7 +71,7 @@ void executeGpuBinaryOpTest(
  context.FinishDeviceComputation();
  // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
+  auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
  tensory_host->CopyFrom(*tensory, &context);
  context.FinishDeviceComputation();
@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
  vector<int> shapex{33 * 9, 25};
  vector<int> shapey{33, 25};
-  auto* tensorx = blobx->GetMutableTensor(CUDA);
+  auto* tensorx = BlobGetMutableTensor(blobx, CUDA);
  tensorx->Resize(shapex);
  int stripe = 33 * 25;
  vector<float> tot(33, 0.0);
@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
    }
  }
-  auto* tensory = bloby->GetMutableTensor(CUDA);
+  auto* tensory = BlobGetMutableTensor(bloby, CUDA);
  tensory->Resize(shapey);
  math::Set<float, CUDAContext>(
      stripe, 0.0, tensory->mutable_data<float>(), &context);
@ -125,7 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
  context.FinishDeviceComputation();
  // Copy result to CPU so we can inspect it
-  auto* tensory_host = bloby_host->GetMutableTensor(CPU);
+  auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
  tensory_host->CopyFrom(*tensory, &context);
  context.FinishDeviceComputation();
@ -258,9 +258,9 @@ class GemmBatchedGPUTest
    Blob* X_blob = ws_.CreateBlob("X");
    Blob* W_blob = ws_.CreateBlob("W");
    Blob* Y_blob = ws_.CreateBlob("Y");
-    X_ = X_blob->GetMutableTensor(CUDA);
+    X_ = BlobGetMutableTensor(X_blob, CUDA);
-    W_ = W_blob->GetMutableTensor(CUDA);
+    W_ = BlobGetMutableTensor(W_blob, CUDA);
-    Y_ = Y_blob->GetMutableTensor(CUDA);
+    Y_ = BlobGetMutableTensor(Y_blob, CUDA);
    X_->Resize(std::vector<int64_t>{3, 5, 10});
    W_->Resize(std::vector<int64_t>{3, 6, 10});
    Y_->Resize(std::vector<int64_t>{3, 5, 6});
@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test {
    cuda_context_ = make_unique<CUDAContext>(option_);
    Blob* blob_x = ws_.CreateBlob("X");
    Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
+    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    Y_ = BlobGetMutableTensor(blob_y, CUDA);
  }
  void SetUpData(
@ -402,7 +402,7 @@ class ReduceTensorGPUTest : public testing::Test {
  void VerifyResult(const std::vector<float>& expected_output) {
    Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
    Y_host->CopyFrom(*Y_, cuda_context_.get());
    cuda_context_->FinishDeviceComputation();
    ASSERT_EQ(expected_output.size(), Y_host->size());
@ -664,8 +664,8 @@ class BroadcastGPUTest : public testing::Test {
    cuda_context_ = make_unique<CUDAContext>(option_);
    Blob* blob_x = ws_.CreateBlob("X");
    Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
+    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    Y_ = BlobGetMutableTensor(blob_y, CUDA);
  }
  void SetUpData(
@ -681,7 +681,7 @@ class BroadcastGPUTest : public testing::Test {
  void VerifyResult(const std::vector<float>& expected_output) {
    Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
    Y_host->CopyFrom(*Y_, cuda_context_.get());
    cuda_context_->FinishDeviceComputation();
    ASSERT_EQ(expected_output.size(), Y_host->size());
@ -741,9 +741,9 @@ class MomentsGPUTest : public testing::Test {
    Blob* blob_x = ws_.CreateBlob("X");
    Blob* blob_mean = ws_.CreateBlob("mean");
    Blob* blob_variance = ws_.CreateBlob("variance");
-    X_ = blob_x->GetMutableTensor(CUDA);
+    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    mean_ = blob_mean->GetMutableTensor(CUDA);
+    mean_ = BlobGetMutableTensor(blob_mean, CUDA);
-    variance_ = blob_variance->GetMutableTensor(CUDA);
+    variance_ = BlobGetMutableTensor(blob_variance, CUDA);
  }
  void SetUpData(
@ -766,10 +766,10 @@ class MomentsGPUTest : public testing::Test {
      const std::vector<float>& mean_data,
      const std::vector<float>& variance_data) {
    Blob* blob_mean_host = ws_.CreateBlob("mean_host");
-    auto* mean_host = blob_mean_host->GetMutableTensor(CPU);
+    auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
    mean_host->CopyFrom(*mean_, cuda_context_.get());
    Blob* blob_variance_host = ws_.CreateBlob("variance_host");
-    auto* variance_host = blob_variance_host->GetMutableTensor(CPU);
+    auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
    variance_host->CopyFrom(*variance_, cuda_context_.get());
    cuda_context_->FinishDeviceComputation();
@ -868,8 +868,8 @@ class TransposeGPUTest : public testing::Test {
    cuda_context_ = make_unique<CUDAContext>(option_);
    Blob* blob_x = ws_.CreateBlob("X");
    Blob* blob_y = ws_.CreateBlob("Y");
-    X_ = blob_x->GetMutableTensor(CUDA);
+    X_ = BlobGetMutableTensor(blob_x, CUDA);
-    Y_ = blob_y->GetMutableTensor(CUDA);
+    Y_ = BlobGetMutableTensor(blob_y, CUDA);
  }
  void SetUpData(
@ -890,7 +890,7 @@ class TransposeGPUTest : public testing::Test {
  void VerifyResult(const std::vector<float>& expected_output) {
    Blob* blob_y_host = ws_.CreateBlob("Y_host");
-    auto* Y_host = blob_y_host->GetMutableTensor(CPU);
+    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
    Y_host->CopyFrom(*Y_, cuda_context_.get());
    cuda_context_->FinishDeviceComputation();
    ASSERT_EQ(expected_output.size(), Y_host->size());