diff --git a/aten/src/ATen/core/Allocator.h b/aten/src/ATen/core/Allocator.h
index a3bae36efe4a..c8c8a236fbe9 100644
--- a/aten/src/ATen/core/Allocator.h
+++ b/aten/src/ATen/core/Allocator.h
@@ -43,6 +43,9 @@ class DataPtr {
   void* release_context() {
     return ptr_.release_context();
   }
+  std::unique_ptr<void, DeleterFnPtr>&& move_context() {
+    return ptr_.move_context();
+  }
   operator bool() const {
     return static_cast<bool>(ptr_);
   }
@@ -50,6 +53,9 @@ class DataPtr {
   T* cast_context(DeleterFnPtr expected_deleter) const {
     return ptr_.cast_context<T>(expected_deleter);
   }
+  DeleterFnPtr get_deleter() const {
+    return ptr_.get_deleter();
+  }
   Device device() const {
     return device_;
   }
diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp
index 41cff4f8b6de..30287442b2c3 100644
--- a/aten/src/ATen/core/TensorImpl.cpp
+++ b/aten/src/ATen/core/TensorImpl.cpp
@@ -109,4 +109,20 @@ const Storage& TensorImpl::storage() const {
   return storage_;
 }
 
+static void deletePlacementDeleteContext(void* ptr) {
+  delete static_cast<PlacementDeleteContext*>(ptr);
+}
+
+at::DataPtr PlacementDeleteContext::makeDataPtr(
+    at::DataPtr&& data_ptr,
+    PlacementDtor placement_dtor,
+    size_t size,
+    at::Device device) {
+  auto* ptr = data_ptr.get();
+  return {ptr,
+          new PlacementDeleteContext(std::move(data_ptr), placement_dtor, size),
+          &deletePlacementDeleteContext,
+          device};
+}
+
 } // namespace at
diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 7662069a43cb..0d19ad5a5af4 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -3,13 +3,13 @@
 #include <atomic>
 #include <memory>
 
-#include "ATen/core/Storage.h"
-#include "ATen/core/optional.h"
-#include "ATen/core/TensorTypeId.h"
-#include "ATen/core/TensorTypeIdRegistration.h"
-#include "ATen/core/LegacyTypeDispatch.h"
-#include "ATen/core/Backend.h"
-#include "ATen/core/context_base.h"
+#include <ATen/core/Backend.h>
+#include <ATen/core/LegacyTypeDispatch.h>
+#include <ATen/core/Storage.h>
+#include <ATen/core/TensorTypeId.h>
+#include <ATen/core/TensorTypeIdRegistration.h>
+#include <ATen/core/context_base.h>
+#include <ATen/core/optional.h>
 
 #include "caffe2/core/allocator.h"
 #include "caffe2/core/common.h"
@@ -99,6 +99,39 @@ inline int canonical_axis_index_(int axis_index, int ndims) {
   return axis_index;
 }
 
+using PlacementDtor = void (*)(void*, size_t);
+
+/*
+ * A Context that will call extra placement deleter during
+ * deconstruction.
+ *
+ * Accept a already constructed DataPtr and store it as member
+ * during destruction, we'll call extra deleter on the underlying
+ * data pointer before the DataPtr is destructed.
+ * `data_ptr_` owns the memory.
+ */
+struct CAFFE2_API PlacementDeleteContext {
+  at::DataPtr data_ptr_;
+  PlacementDtor placement_dtor_;
+  size_t size_;
+  PlacementDeleteContext(
+      at::DataPtr&& data_ptr,
+      PlacementDtor placement_dtor,
+      size_t size)
+      : data_ptr_(std::move(data_ptr)),
+        placement_dtor_(placement_dtor),
+        size_(size) {}
+  static at::DataPtr makeDataPtr(
+      at::DataPtr&& data_ptr,
+      PlacementDtor placement_dtor,
+      size_t size,
+      at::Device device);
+  ~PlacementDeleteContext() {
+    placement_dtor_(data_ptr_.get(), size_);
+    // original memory will be freed when data_ptr_ is destructed
+  }
+};
+
 /**
  * The low-level representation of a tensor, which contains a storage
  * (which contains the actual data) and metadata (e.g., sizes and strides)
@@ -734,29 +767,19 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         // destruction procedure.
         auto size = numel_;
         auto dtor = data_type_.dtor();
-        void* ptr;
-        at::DeleterFnPtr deleter;
-        auto ptr_and_deleter = GetStaticContext()->New(
+        auto data_ptr = GetStaticContext()->New(
             numel_ * storage_.itemsize()); // Removing this can get rid of
                                            // InefficientStdFunctionContext
-        ptr = ptr_and_deleter.first;
-        deleter = ptr_and_deleter.second;
-        storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr(
-            ptr,
-            [size, dtor, deleter](void* local_ptr) -> void {
-              dtor(local_ptr, size);
-              deleter(local_ptr);
-            },
+        storage_.set_data_ptr(PlacementDeleteContext::makeDataPtr(
+            std::move(data_ptr),
+            dtor,
+            size,
             at::Device(storage_.device_type())));
         data_type_.ctor()(storage_.data(), numel_);
       } else {
         // For fundamental type, new and delete is easier.
-        auto ptr_and_deleter =
-            GetStaticContext()->New(numel_ * storage_.itemsize());
-        storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr(
-            ptr_and_deleter.first,
-            ptr_and_deleter.second,
-            at::Device(storage_.device_type())));
+        storage_.set_data_ptr(
+            GetStaticContext()->New(numel_ * storage_.itemsize()));
       }
       storage_.set_numel(numel_);
       AT_ASSERT(storage_offset_ == 0); // because we just reallocated
diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h
index daa6cdd37357..59bced8c037c 100644
--- a/aten/src/ATen/core/UniqueVoidPtr.h
+++ b/aten/src/ATen/core/UniqueVoidPtr.h
@@ -63,6 +63,10 @@ class UniqueVoidPtr {
   void* release_context() {
     return ctx_.release();
   }
+  std::unique_ptr<void, DeleterFnPtr>&& move_context() {
+    return std::move(ctx_);
+  }
+
   template <typename T>
   T* cast_context(DeleterFnPtr expected_deleter) const {
     if (get_deleter() != expected_deleter)
diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h
index 13bc885da344..1aeec94ab14f 100644
--- a/aten/src/ATen/core/context_base.h
+++ b/aten/src/ATen/core/context_base.h
@@ -7,6 +7,7 @@
 #include <unordered_map>
 
 #include <ATen/core/ATenGeneral.h>
+#include <ATen/core/Allocator.h>
 #include <ATen/core/Device.h>
 #include <ATen/core/Error.h>
 #include <ATen/core/UniqueVoidPtr.h>
@@ -30,7 +31,7 @@ class CAFFE2_API BaseStaticContext {
  public:
   virtual ~BaseStaticContext() noexcept {}
 
-  virtual std::pair<void*, DeleterFnPtr> New(size_t nbytes) const = 0;
+  virtual at::DataPtr New(size_t nbytes) const = 0;
 
   virtual DeviceType GetDeviceType() = 0;
 
diff --git a/binaries/core_overhead_benchmark_gpu.cc b/binaries/core_overhead_benchmark_gpu.cc
index e024e4ddc9fa..b16d99c0bf41 100644
--- a/binaries/core_overhead_benchmark_gpu.cc
+++ b/binaries/core_overhead_benchmark_gpu.cc
@@ -190,9 +190,8 @@ BENCHMARK(BM_OperatorCreationCUDA);
 static void BM_RawAllocDeallocCPU(benchmark::State& state) {
   while (state.KeepRunning()) {
     // Allocating only 1 byte in order to measure the overhead.
-    auto ptr_and_deleter = GetCPUAllocator()->New(1);
-    // Deallocate.
-    ptr_and_deleter.second(ptr_and_deleter.first);
+    auto data_ptr = GetCPUAllocator()->allocate(1);
+    // Deallocated when it's out of scope
   }
 }
 BENCHMARK(BM_RawAllocDeallocCPU);
diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc
index bd0e99b20a0c..0933e4f0d002 100644
--- a/caffe2/core/allocator.cc
+++ b/caffe2/core/allocator.cc
@@ -16,16 +16,17 @@ namespace caffe2 {
 
 void NoDelete(void*) {}
 
-static std::unique_ptr<CPUAllocator> g_cpu_allocator(new DefaultCPUAllocator());
-CPUAllocator* GetCPUAllocator() {
+static std::unique_ptr<at::Allocator> g_cpu_allocator(
+    new DefaultCPUAllocator());
+at::Allocator* GetCPUAllocator() {
   return g_cpu_allocator.get();
 }
 
-void SetCPUAllocator(CPUAllocator* alloc) {
+void SetCPUAllocator(at::Allocator* alloc) {
   g_cpu_allocator.reset(alloc);
 }
 
-MemoryAllocationReporter CPUStaticContext::reporter_;
+MemoryAllocationReporter DefaultCPUAllocator::reporter_;
 
 void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
   std::lock_guard<std::mutex> guard(mutex_);
diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h
index aa41595ae06b..994a9e1e5994 100644
--- a/caffe2/core/allocator.h
+++ b/caffe2/core/allocator.h
@@ -4,6 +4,7 @@
 #include <cstring>
 #include <unordered_map>
 
+#include <ATen/core/Allocator.h>
 #include "caffe2/core/logging.h"
 #include "caffe2/core/numa.h"
 
@@ -42,10 +43,10 @@ class CAFFE2_API MemoryAllocationReporter {
   size_t allocated_;
 };
 
-struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator {
+struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
   DefaultCPUAllocator() {}
   ~DefaultCPUAllocator() override {}
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
+  at::DataPtr allocate(size_t nbytes) const override {
     void* data = nullptr;
 #ifdef __ANDROID__
     data = memalign(gCaffe2Alignment, nbytes);
@@ -60,7 +61,11 @@ struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator {
     if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
       memset(data, 0, nbytes);
     }
-    return {data, Delete};
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      reporter_.New(data, nbytes);
+      return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
+    }
+    return {data, data, &Delete, at::Device(at::DeviceType::CPU)};
   }
 
 #ifdef _MSC_VER
@@ -73,16 +78,27 @@ struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator {
   }
 #endif
 
-  MemoryDeleter GetDeleter() override {
-    return Delete;
+  static void ReportAndDelete(void* ptr) {
+    reporter_.Delete(ptr);
+    Delete(ptr);
   }
+
+  at::DeleterFnPtr raw_deleter() const override {
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      return &ReportAndDelete;
+    }
+    return &Delete;
+  }
+
+ protected:
+  static MemoryAllocationReporter reporter_;
 };
 
 // Get the CPU Alloctor.
-CAFFE2_API CPUAllocator* GetCPUAllocator();
+CAFFE2_API at::Allocator* GetCPUAllocator();
 // Sets the CPU allocator to the given allocator: the caller gives away the
 // ownership of the pointer.
-CAFFE2_API void SetCPUAllocator(CPUAllocator* alloc);
+CAFFE2_API void SetCPUAllocator(at::Allocator* alloc);
 
 } // namespace caffe2
 
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
index af66396af72c..b0f55504dcb9 100644
--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@@ -13,8 +13,8 @@
 #include "caffe2/core/typeid.h"
 #include "caffe2/proto/caffe2_pb.h"
 
-#include "ATen/core/ATenCoreTest.h"
-#include "ATen/core/ArrayRef.h"
+#include <ATen/core/ATenCoreTest.h>
+#include <ATen/core/ArrayRef.h>
 
 CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);
 
@@ -85,7 +85,7 @@ class CAFFE2_API CPUContext final : public BaseContext {
     return *random_generator_.get();
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+  inline static at::DataPtr New(size_t nbytes) {
     return StaticContext()->New(nbytes);
   }
 
@@ -185,13 +185,8 @@ inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
 // TODO(jerryzh): merge CPUStaticContext with Allocator
 class CAFFE2_API CPUStaticContext : public BaseStaticContext {
  public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      reporter_.New(data_and_deleter.first, nbytes);
-      data_and_deleter.second = ReportAndDelete;
-    }
-    return data_and_deleter;
+  at::DataPtr New(size_t nbytes) const override {
+    return GetCPUAllocator()->allocate(nbytes);
   }
 
   DeviceType GetDeviceType() override {
@@ -204,14 +199,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext {
     device->set_device_type(TypeToProto(GetDeviceType()));
   }
 
- protected:
-  static MemoryAllocationReporter reporter_;
-
- private:
-  static void ReportAndDelete(void* ptr) {
-    reporter_.Delete(ptr);
-    GetCPUAllocator()->GetDeleter()(ptr);
-  }
 };
 
 }  // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 0d9e2686212a..6591d65168cc 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -314,7 +314,8 @@ void TrackMemoryAlloc(size_t nbytes) {
 }
 }
 
-std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
+// TODO: wrap this function in DefaultCUDAAllocator
+at::DataPtr CUDAStaticContext::New(size_t nbytes) const {
   // Lock the mutex
   std::lock_guard<std::mutex> lock(CUDAContext::mutex());
   // A one-time caffe2 cuda initializer.
@@ -331,7 +332,7 @@ std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
       g_size_map[ptr] = nbytes;
       g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
     }
-    return {ptr, Delete};
+    return {ptr, ptr, Delete, at::Device(CUDA)};
   case CudaMemoryPoolType::CUB:
     CUDA_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
     g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
@@ -340,16 +341,16 @@ std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
     if (FLAGS_caffe2_gpu_memory_tracking) {
       g_size_map[ptr] = nbytes;
     }
-    return {ptr, Delete};
+    return {ptr, ptr, Delete, at::Device(CUDA)};
   case CudaMemoryPoolType::THC:
     CUDA_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */));
     if (FLAGS_caffe2_gpu_memory_tracking) {
       g_size_map[ptr] = nbytes;
       g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
     }
-    return {ptr, Delete};
+    return {ptr, ptr, Delete, at::Device(CUDA)};
   }
-  return {nullptr, Delete};
+  return {nullptr, nullptr, Delete, at::Device(CUDA)};
 }
 
 void CUDAStaticContext::Delete(void* ptr) {
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index ce73f5f94282..ce148475a3ab 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -223,7 +223,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
     return curand_generator_;
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+  inline static at::DataPtr New(size_t nbytes) {
     return StaticContext()->New(nbytes);
   }
 
@@ -334,26 +334,28 @@ inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
  * GPU present during runtime, at global initialization time we will set
  * the CPU memory allocator to allocate pinned memory.
  */
-struct CAFFE2_CUDA_API PinnedCPUAllocator final : CPUAllocator {
+struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
   PinnedCPUAllocator() {}
   ~PinnedCPUAllocator() override {}
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
+  at::DataPtr allocate(size_t nbytes) const override {
     void* data;
+    at::DataPtr data_ptr;
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
     if (IsNUMAEnabled()) {
-      auto ptr_and_deleter = baseAllocator_.New(nbytes);
-      data = ptr_and_deleter.first;
+      data_ptr = baseAllocator_.allocate(nbytes);
+      data = data_ptr.get();
       CAFFE_ENFORCE(data);
       CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
     } else {
       CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+      data_ptr = {data, data, &Delete, at::Device(CPU)};
     }
     memset(data, 0, nbytes);
-    return {data, Delete};
+    return data_ptr;
   }
 
-  MemoryDeleter GetDeleter() override {
-    return Delete;
+  at::DeleterFnPtr raw_deleter() const override {
+    return &Delete;
   }
 
  private:
@@ -385,13 +387,14 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : CPUAllocator {
 
 class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext {
  public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+  at::DataPtr New(size_t nbytes) const override;
 
   DeviceType GetDeviceType() override {
     return CUDA;
   }
 
   void ExtractDeviceOption(DeviceOption* device, const void* data) override {
+    CAFFE_ENFORCE(data, "data cannot be nullptr");
     device->set_device_type(TypeToProto(GetDeviceType()));
     device->set_cuda_gpu_id(GetGPUIDForPointer(data));
   }
diff --git a/caffe2/core/context_gpu_test.cc b/caffe2/core/context_gpu_test.cc
index ce1cb74b9881..956a231ce641 100644
--- a/caffe2/core/context_gpu_test.cc
+++ b/caffe2/core/context_gpu_test.cc
@@ -11,12 +11,6 @@ CAFFE2_DECLARE_bool(caffe2_cuda_full_device_control);
 
 namespace caffe2 {
 
-namespace {
-std::shared_ptr<void> shared_from_new(std::pair<void*, MemoryDeleter>&& p) {
-  return std::shared_ptr<void>(p.first, std::move(p.second));
-}
-}
-
 TEST(CUDATest, HasCudaRuntime) {
   EXPECT_TRUE(HasCudaRuntime());
 }
@@ -25,7 +19,7 @@ TEST(CUDAContextTest, TestAllocDealloc) {
   if (!HasCudaGPU()) return;
   CUDAContext context(0);
   context.SwitchToDevice();
-  auto data = shared_from_new(CUDAContext::New(10 * sizeof(float)));
+  auto data = CUDAContext::New(10 * sizeof(float));
   EXPECT_NE(data.get(), nullptr);
 }
 
@@ -66,20 +60,20 @@ TEST(CUDAContextTest, MemoryPoolAllocateDealloc) {
   for (int i = 0; i < NumCudaDevices(); ++i) {
     LOG(INFO) << "Device " << i << " of " << NumCudaDevices();
     DeviceGuard guard(i);
-    auto allocated = shared_from_new(CUDAContext::New(nbytes));
+    auto allocated = CUDAContext::New(nbytes);
     EXPECT_NE(allocated, nullptr);
     cudaPointerAttributes attr;
     CUDA_ENFORCE(cudaPointerGetAttributes(&attr, allocated.get()));
     EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice);
     EXPECT_EQ(attr.device, i);
     void* prev_allocated = allocated.get();
-    allocated.reset();
-    auto new_allocated = shared_from_new(CUDAContext::New(nbytes));
+    allocated.clear();
+    auto new_allocated = CUDAContext::New(nbytes);
     // With a pool, the above allocation should yield the same address.
     EXPECT_EQ(new_allocated.get(), prev_allocated);
     // But, if we are allocating something larger, we will have a different
     // chunk of memory.
-    auto larger_allocated = shared_from_new(CUDAContext::New(nbytes * 2));
+    auto larger_allocated = CUDAContext::New(nbytes * 2);
     EXPECT_NE(larger_allocated.get(), prev_allocated);
   }
 }
diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc
index a01f967b536a..987c9c422cb3 100644
--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@@ -14,17 +14,17 @@ TEST(CPUContextTest, ATenCoreTest) {
 TEST(CPUContextTest, TestAllocAlignment) {
   for (int i = 1; i < 10; ++i) {
     auto data = CPUContext::New(i);
-    EXPECT_EQ((reinterpret_cast<size_t>(data.first) % gCaffe2Alignment), 0);
-    data.second(data.first);
+    EXPECT_EQ((reinterpret_cast<size_t>(data.get()) % gCaffe2Alignment), 0);
+    // data is freed when out of scope
   }
 }
 
 TEST(CPUContextTest, TestAllocDealloc) {
-  auto data_and_deleter = CPUContext::New(10 * sizeof(float));
-  float* data = static_cast<float*>(data_and_deleter.first);
+  auto data_ptr = CPUContext::New(10 * sizeof(float));
+  float* data = static_cast<float*>(data_ptr.get());
   EXPECT_NE(data, nullptr);
-  auto dst_data_and_deleter = CPUContext::New(10 * sizeof(float));
-  float* dst_data = static_cast<float*>(dst_data_and_deleter.first);
+  auto dst_data_ptr = CPUContext::New(10 * sizeof(float));
+  float* dst_data = static_cast<float*>(dst_data_ptr.get());
   EXPECT_NE(dst_data, nullptr);
   for (int i = 0; i < 10; ++i) {
     data[i] = i;
@@ -35,8 +35,7 @@ TEST(CPUContextTest, TestAllocDealloc) {
   for (int i = 0; i < 10; ++i) {
     EXPECT_FLOAT_EQ(dst_data[i], i);
   }
-  data_and_deleter.second(data);
-  dst_data_and_deleter.second(dst_data);
+  // data_ptr is freed when out of scope
 }
 
 }  // namespace caffe2
diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h
index 1bd39fa62a39..2e6e185ea387 100644
--- a/caffe2/core/cudnn_wrappers.h
+++ b/caffe2/core/cudnn_wrappers.h
@@ -24,8 +24,7 @@ struct CuDNNWorkspace {
   void* get(size_t nbytes) {
     if (nbytes_ < nbytes) {
       reset();
-      auto data_and_deleter = CUDAContext::New(nbytes);
-      data_ = {data_and_deleter.first, data_and_deleter.second};
+      data_ = CUDAContext::New(nbytes);
       nbytes_ = nbytes;
     }
     CAFFE_ENFORCE_GE(nbytes_, nbytes);
@@ -33,12 +32,12 @@ struct CuDNNWorkspace {
   }
 
   void reset() {
-    data_ = nullptr;
+    data_.clear();
     nbytes_ = 0;
   }
 
  private:
-  std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
+  at::DataPtr data_{nullptr, nullptr, &NoDelete, at::Device(CUDA)};
   size_t nbytes_{0};
 };
 
diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc
index 3eadaf0e71b1..12e76770201b 100644
--- a/caffe2/core/hip/context_hip.cc
+++ b/caffe2/core/hip/context_hip.cc
@@ -326,7 +326,7 @@ void TrackMemoryAlloc(size_t nbytes)
 }
 }
 
-std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
+at::DataPtr HIPStaticContext::New(size_t nbytes) const {
   // Lock the mutex
   std::lock_guard<std::mutex> lock(HIPContext::mutex());
   // A one-time caffe2 cuda initializer.
@@ -344,7 +344,7 @@ std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
             g_size_map[ptr]               = nbytes;
             g_hip_device_affiliation[ptr] = CaffeHipGetDevice();
         }
-        return {ptr, Delete};
+        return {ptr, ptr, &Delete, at::Device(HIP)};
     case HipMemoryPoolType::CUB:
         HIP_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
         g_hip_device_affiliation[ptr] = CaffeHipGetDevice();
@@ -353,7 +353,7 @@ std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
         {
             g_size_map[ptr] = nbytes;
         }
-        return {ptr, Delete};
+        return {ptr, ptr, &Delete, at::Device(HIP)};
     case HipMemoryPoolType::THC:
         HIP_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */));
         if (FLAGS_caffe2_gpu_memory_tracking)
@@ -361,9 +361,9 @@ std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
           g_size_map[ptr]                = nbytes;
           g_hip_device_affiliation[ptr] = CaffeHipGetDevice();
         }
-        return {ptr, Delete};
-    }
-    return {nullptr, Delete};
+        return {ptr, ptr, &Delete, at::Device(HIP)};
+  }
+  return {nullptr, nullptr, &Delete, at::Device(HIP)};
 }
 
 void HIPStaticContext::Delete(void* ptr) {
diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h
index fb04336354e7..d7d456b975e5 100644
--- a/caffe2/core/hip/context_hip.h
+++ b/caffe2/core/hip/context_hip.h
@@ -206,7 +206,7 @@ class HIPContext final : public BaseContext {
     return hiprand_generator_;
   }
 
-  static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+  static at::DataPtr New(size_t nbytes) {
     return StaticContext()->New(nbytes);
   }
 
@@ -323,26 +323,28 @@ inline void CPUContext::CopyBytes<CPUContext, HIPContext>(
  * GPU present during runtime, at global initialization time we will set
  * the CPU memory allocator to allocate pinned memory.
  */
-struct PinnedCPUAllocator final : CPUAllocator {
+struct PinnedCPUAllocator final : public at::Allocator {
   PinnedCPUAllocator() {}
   ~PinnedCPUAllocator() override {}
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
+  at::DataPtr allocate(size_t nbytes) const override {
     void* data;
+    at::DataPtr data_ptr;
     std::lock_guard<std::mutex> lock(HIPContext::mutex());
     if (IsNUMAEnabled()) {
-      auto ptr_and_deleter = baseAllocator_.New(nbytes);
-      data = ptr_and_deleter.first;
+      data_ptr = baseAllocator_.allocate(nbytes);
+      data = data_ptr.get();
       CAFFE_ENFORCE(data);
       HIP_ENFORCE(hipHostRegister(data, nbytes, hipHostRegisterDefault));
     } else {
       HIP_ENFORCE(hipHostMalloc(&data, nbytes));
+      data_ptr = {data, data, &Delete, at::Device(CPU)};
     }
     memset(data, 0, nbytes);
-    return {data, Delete};
+    return data_ptr;
   }
 
-  MemoryDeleter GetDeleter() override {
-    return Delete;
+  at::DeleterFnPtr raw_deleter() const override {
+    return &Delete;
   }
 
  private:
@@ -374,7 +376,7 @@ struct PinnedCPUAllocator final : CPUAllocator {
 
 class HIPStaticContext final : public BaseStaticContext {
  public:
-  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
+  at::DataPtr New(size_t nbytes) const override;
 
   DeviceType GetDeviceType() override {
     return HIP;
diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h
index 328c7522258d..46977248e0da 100644
--- a/caffe2/core/hip/miopen_wrapper.h
+++ b/caffe2/core/hip/miopen_wrapper.h
@@ -26,8 +26,7 @@ struct MIOPENWorkspace
         if(nbytes_ < nbytes)
         {
             reset();
-            auto data_and_deleter = HIPContext::New(nbytes);
-            data_                 = {data_and_deleter.first, data_and_deleter.second};
+            data_ = HIPContext::New(nbytes);
             nbytes_               = nbytes;
         }
         CAFFE_ENFORCE_GE(nbytes_, nbytes);
@@ -36,13 +35,13 @@ struct MIOPENWorkspace
 
     void reset()
     {
-        data_   = nullptr;
-        nbytes_ = 0;
+      data_.clear();
+      nbytes_ = 0;
     }
 
     private:
-    std::unique_ptr<void, MemoryDeleter> data_{nullptr, NoDelete};
-    size_t nbytes_{0};
+     at::DataPtr data_;
+     size_t nbytes_{0};
 };
 
 // MIOPENState is the owner of the MIOPENWorkspace, and serializes all
diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h
index 385ebf1d5f9f..867dee0cdf45 100644
--- a/caffe2/core/qtensor.h
+++ b/caffe2/core/qtensor.h
@@ -59,7 +59,7 @@ class C10_EXPORT QTensor {
       size_t source_size = std::accumulate(
           dim_source.begin(), dim_source.end(), 1, std::multiplies<int>());
       if ((source_size * (precision_ + signed_)) > capacity_) {
-        data_.reset();
+        data_ptr_.clear();
         capacity_ = 0;
       }
       dims_ = dim_source;
@@ -104,12 +104,12 @@ class C10_EXPORT QTensor {
 
   void SetPrecision(const unsigned char precision) {
     precision_ = precision;
-    data_.reset();
+    data_ptr_.clear();
   }
 
   void SetSigned(const bool make_signed = true) {
     signed_ = make_signed;
-    data_.reset();
+    data_ptr_.clear();
   }
 
   void SetScale(const double scale) {
@@ -121,19 +121,16 @@ class C10_EXPORT QTensor {
   }
 
   unsigned char* mutable_data() {
-    if (!data_) {
-      auto ptr_and_deleter = Context::New(nbytes());
-      data_.reset(
-          static_cast<unsigned char*>(ptr_and_deleter.first),
-          ptr_and_deleter.second);
+    if (!data_ptr_) {
+      data_ptr_ = Context::New(nbytes());
       capacity_ = nbytes() * CHAR_BIT;
     }
     CAFFE_ENFORCE(capacity_ == nbytes() * CHAR_BIT);
-    return data_.get();
+    return static_cast<unsigned char*>(data_ptr_.get());
   }
 
   inline const unsigned char* data() const {
-    return data_.get();
+    return static_cast<unsigned char*>(data_ptr_.get());
   }
 
   inline size_t size() const {
@@ -242,7 +239,7 @@ class C10_EXPORT QTensor {
   unsigned char alignment_ = CHAR_BIT;
 
   // Allocated data.
-  std::shared_ptr<unsigned char> data_;
+  at::DataPtr data_ptr_;
 
   // value = scale_ * (x + bias_)
   double scale_;
diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h
index 087078c507d1..b5c702ea3d64 100644
--- a/caffe2/ideep/utils/ideep_context.h
+++ b/caffe2/ideep/utils/ideep_context.h
@@ -55,7 +55,7 @@ class IDEEPContext final : public BaseContext {
     return *random_generator_.get();
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+  inline static at::DataPtr New(size_t nbytes) {
     return StaticContext()->New(nbytes);
   }
 
@@ -176,8 +176,8 @@ inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
 
 class IDEEPStaticContext : public BaseStaticContext {
  public:
-  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    return GetCPUAllocator()->New(nbytes);
+  inline at::DataPtr New(size_t nbytes) const override {
+    return GetCPUAllocator()->allocate(nbytes);
   }
 
   DeviceType GetDeviceType() override {
diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h
index 8364026d91c6..7735283b6e04 100644
--- a/caffe2/mkl/utils/mkl_context.h
+++ b/caffe2/mkl/utils/mkl_context.h
@@ -62,7 +62,7 @@ class MKLContext : public BaseContext {
     return *random_generator_.get();
   }
 
-  inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
+  inline static at::DataPtr New(size_t nbytes) {
     return StaticContext()->New(nbytes);
   }
 
@@ -153,8 +153,8 @@ inline void MKLContext::CopyBytes<MKLContext, MKLContext>(
 
 class MKLStaticContext : public BaseStaticContext {
  public:
-  inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
-    return GetCPUAllocator()->New(nbytes);
+  inline at::DataPtr New(size_t nbytes) const override {
+    return GetCPUAllocator()->allocate(nbytes);
   }
 
   DeviceType GetDeviceType() override {