diff --git a/aten/src/ATen/core/Allocator.h b/aten/src/ATen/core/Allocator.h index a3bae36efe4a..c8c8a236fbe9 100644 --- a/aten/src/ATen/core/Allocator.h +++ b/aten/src/ATen/core/Allocator.h @@ -43,6 +43,9 @@ class DataPtr { void* release_context() { return ptr_.release_context(); } + std::unique_ptr&& move_context() { + return ptr_.move_context(); + } operator bool() const { return static_cast(ptr_); } @@ -50,6 +53,9 @@ class DataPtr { T* cast_context(DeleterFnPtr expected_deleter) const { return ptr_.cast_context(expected_deleter); } + DeleterFnPtr get_deleter() const { + return ptr_.get_deleter(); + } Device device() const { return device_; } diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 41cff4f8b6de..30287442b2c3 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -109,4 +109,20 @@ const Storage& TensorImpl::storage() const { return storage_; } +static void deletePlacementDeleteContext(void* ptr) { + delete static_cast(ptr); +} + +at::DataPtr PlacementDeleteContext::makeDataPtr( + at::DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size, + at::Device device) { + auto* ptr = data_ptr.get(); + return {ptr, + new PlacementDeleteContext(std::move(data_ptr), placement_dtor, size), + &deletePlacementDeleteContext, + device}; +} + } // namespace at diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 7662069a43cb..0d19ad5a5af4 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -3,13 +3,13 @@ #include #include -#include "ATen/core/Storage.h" -#include "ATen/core/optional.h" -#include "ATen/core/TensorTypeId.h" -#include "ATen/core/TensorTypeIdRegistration.h" -#include "ATen/core/LegacyTypeDispatch.h" -#include "ATen/core/Backend.h" -#include "ATen/core/context_base.h" +#include +#include +#include +#include +#include +#include +#include #include "caffe2/core/allocator.h" #include "caffe2/core/common.h" @@ -99,6 +99,39 @@ inline int canonical_axis_index_(int axis_index, int ndims) { return axis_index; } +using PlacementDtor = void (*)(void*, size_t); + +/* + * A Context that will call extra placement deleter during + * deconstruction. + * + * Accept a already constructed DataPtr and store it as member + * during destruction, we'll call extra deleter on the underlying + * data pointer before the DataPtr is destructed. + * `data_ptr_` owns the memory. + */ +struct CAFFE2_API PlacementDeleteContext { + at::DataPtr data_ptr_; + PlacementDtor placement_dtor_; + size_t size_; + PlacementDeleteContext( + at::DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size) + : data_ptr_(std::move(data_ptr)), + placement_dtor_(placement_dtor), + size_(size) {} + static at::DataPtr makeDataPtr( + at::DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size, + at::Device device); + ~PlacementDeleteContext() { + placement_dtor_(data_ptr_.get(), size_); + // original memory will be freed when data_ptr_ is destructed + } +}; + /** * The low-level representation of a tensor, which contains a storage * (which contains the actual data) and metadata (e.g., sizes and strides) @@ -734,29 +767,19 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // destruction procedure. auto size = numel_; auto dtor = data_type_.dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( + auto data_ptr = GetStaticContext()->New( numel_ * storage_.itemsize()); // Removing this can get rid of // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, + storage_.set_data_ptr(PlacementDeleteContext::makeDataPtr( + std::move(data_ptr), + dtor, + size, at::Device(storage_.device_type()))); data_type_.ctor()(storage_.data(), numel_); } else { // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); + storage_.set_data_ptr( + GetStaticContext()->New(numel_ * storage_.itemsize())); } storage_.set_numel(numel_); AT_ASSERT(storage_offset_ == 0); // because we just reallocated diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h index daa6cdd37357..59bced8c037c 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -63,6 +63,10 @@ class UniqueVoidPtr { void* release_context() { return ctx_.release(); } + std::unique_ptr&& move_context() { + return std::move(ctx_); + } + template T* cast_context(DeleterFnPtr expected_deleter) const { if (get_deleter() != expected_deleter) diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 13bc885da344..1aeec94ab14f 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ class CAFFE2_API BaseStaticContext { public: virtual ~BaseStaticContext() noexcept {} - virtual std::pair New(size_t nbytes) const = 0; + virtual at::DataPtr New(size_t nbytes) const = 0; virtual DeviceType GetDeviceType() = 0; diff --git a/binaries/core_overhead_benchmark_gpu.cc b/binaries/core_overhead_benchmark_gpu.cc index e024e4ddc9fa..b16d99c0bf41 100644 --- a/binaries/core_overhead_benchmark_gpu.cc +++ b/binaries/core_overhead_benchmark_gpu.cc @@ -190,9 +190,8 @@ BENCHMARK(BM_OperatorCreationCUDA); static void BM_RawAllocDeallocCPU(benchmark::State& state) { while (state.KeepRunning()) { // Allocating only 1 byte in order to measure the overhead. - auto ptr_and_deleter = GetCPUAllocator()->New(1); - // Deallocate. - ptr_and_deleter.second(ptr_and_deleter.first); + auto data_ptr = GetCPUAllocator()->allocate(1); + // Deallocated when it's out of scope } } BENCHMARK(BM_RawAllocDeallocCPU); diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc index bd0e99b20a0c..0933e4f0d002 100644 --- a/caffe2/core/allocator.cc +++ b/caffe2/core/allocator.cc @@ -16,16 +16,17 @@ namespace caffe2 { void NoDelete(void*) {} -static std::unique_ptr g_cpu_allocator(new DefaultCPUAllocator()); -CPUAllocator* GetCPUAllocator() { +static std::unique_ptr g_cpu_allocator( + new DefaultCPUAllocator()); +at::Allocator* GetCPUAllocator() { return g_cpu_allocator.get(); } -void SetCPUAllocator(CPUAllocator* alloc) { +void SetCPUAllocator(at::Allocator* alloc) { g_cpu_allocator.reset(alloc); } -MemoryAllocationReporter CPUStaticContext::reporter_; +MemoryAllocationReporter DefaultCPUAllocator::reporter_; void MemoryAllocationReporter::New(void* ptr, size_t nbytes) { std::lock_guard guard(mutex_); diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index aa41595ae06b..994a9e1e5994 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -4,6 +4,7 @@ #include #include +#include #include "caffe2/core/logging.h" #include "caffe2/core/numa.h" @@ -42,10 +43,10 @@ class CAFFE2_API MemoryAllocationReporter { size_t allocated_; }; -struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator { +struct CAFFE2_API DefaultCPUAllocator final : at::Allocator { DefaultCPUAllocator() {} ~DefaultCPUAllocator() override {} - std::pair New(size_t nbytes) override { + at::DataPtr allocate(size_t nbytes) const override { void* data = nullptr; #ifdef __ANDROID__ data = memalign(gCaffe2Alignment, nbytes); @@ -60,7 +61,11 @@ struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator { if (FLAGS_caffe2_cpu_allocator_do_zero_fill) { memset(data, 0, nbytes); } - return {data, Delete}; + if (FLAGS_caffe2_report_cpu_memory_usage) { + reporter_.New(data, nbytes); + return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)}; + } + return {data, data, &Delete, at::Device(at::DeviceType::CPU)}; } #ifdef _MSC_VER @@ -73,16 +78,27 @@ struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator { } #endif - MemoryDeleter GetDeleter() override { - return Delete; + static void ReportAndDelete(void* ptr) { + reporter_.Delete(ptr); + Delete(ptr); } + + at::DeleterFnPtr raw_deleter() const override { + if (FLAGS_caffe2_report_cpu_memory_usage) { + return &ReportAndDelete; + } + return &Delete; + } + + protected: + static MemoryAllocationReporter reporter_; }; // Get the CPU Alloctor. -CAFFE2_API CPUAllocator* GetCPUAllocator(); +CAFFE2_API at::Allocator* GetCPUAllocator(); // Sets the CPU allocator to the given allocator: the caller gives away the // ownership of the pointer. -CAFFE2_API void SetCPUAllocator(CPUAllocator* alloc); +CAFFE2_API void SetCPUAllocator(at::Allocator* alloc); } // namespace caffe2 diff --git a/caffe2/core/context.h b/caffe2/core/context.h index af66396af72c..b0f55504dcb9 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,8 +13,8 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2_pb.h" -#include "ATen/core/ATenCoreTest.h" -#include "ATen/core/ArrayRef.h" +#include +#include CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); @@ -85,7 +85,7 @@ class CAFFE2_API CPUContext final : public BaseContext { return *random_generator_.get(); } - inline static std::pair New(size_t nbytes) { + inline static at::DataPtr New(size_t nbytes) { return StaticContext()->New(nbytes); } @@ -185,13 +185,8 @@ inline void CPUContext::CopyBytes( // TODO(jerryzh): merge CPUStaticContext with Allocator class CAFFE2_API CPUStaticContext : public BaseStaticContext { public: - std::pair New(size_t nbytes) const override { - auto data_and_deleter = GetCPUAllocator()->New(nbytes); - if (FLAGS_caffe2_report_cpu_memory_usage) { - reporter_.New(data_and_deleter.first, nbytes); - data_and_deleter.second = ReportAndDelete; - } - return data_and_deleter; + at::DataPtr New(size_t nbytes) const override { + return GetCPUAllocator()->allocate(nbytes); } DeviceType GetDeviceType() override { @@ -204,14 +199,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { device->set_device_type(TypeToProto(GetDeviceType())); } - protected: - static MemoryAllocationReporter reporter_; - - private: - static void ReportAndDelete(void* ptr) { - reporter_.Delete(ptr); - GetCPUAllocator()->GetDeleter()(ptr); - } }; } // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 0d9e2686212a..6591d65168cc 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -314,7 +314,8 @@ void TrackMemoryAlloc(size_t nbytes) { } } -std::pair CUDAStaticContext::New(size_t nbytes) const { +// TODO: wrap this function in DefaultCUDAAllocator +at::DataPtr CUDAStaticContext::New(size_t nbytes) const { // Lock the mutex std::lock_guard lock(CUDAContext::mutex()); // A one-time caffe2 cuda initializer. @@ -331,7 +332,7 @@ std::pair CUDAStaticContext::New(size_t nbytes) const { g_size_map[ptr] = nbytes; g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice(); } - return {ptr, Delete}; + return {ptr, ptr, Delete, at::Device(CUDA)}; case CudaMemoryPoolType::CUB: CUDA_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes)); g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice(); @@ -340,16 +341,16 @@ std::pair CUDAStaticContext::New(size_t nbytes) const { if (FLAGS_caffe2_gpu_memory_tracking) { g_size_map[ptr] = nbytes; } - return {ptr, Delete}; + return {ptr, ptr, Delete, at::Device(CUDA)}; case CudaMemoryPoolType::THC: CUDA_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */)); if (FLAGS_caffe2_gpu_memory_tracking) { g_size_map[ptr] = nbytes; g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice(); } - return {ptr, Delete}; + return {ptr, ptr, Delete, at::Device(CUDA)}; } - return {nullptr, Delete}; + return {nullptr, nullptr, Delete, at::Device(CUDA)}; } void CUDAStaticContext::Delete(void* ptr) { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index ce73f5f94282..ce148475a3ab 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -223,7 +223,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return curand_generator_; } - inline static std::pair New(size_t nbytes) { + inline static at::DataPtr New(size_t nbytes) { return StaticContext()->New(nbytes); } @@ -334,26 +334,28 @@ inline void CPUContext::CopyBytes( * GPU present during runtime, at global initialization time we will set * the CPU memory allocator to allocate pinned memory. */ -struct CAFFE2_CUDA_API PinnedCPUAllocator final : CPUAllocator { +struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator { PinnedCPUAllocator() {} ~PinnedCPUAllocator() override {} - std::pair New(size_t nbytes) override { + at::DataPtr allocate(size_t nbytes) const override { void* data; + at::DataPtr data_ptr; std::lock_guard lock(CUDAContext::mutex()); if (IsNUMAEnabled()) { - auto ptr_and_deleter = baseAllocator_.New(nbytes); - data = ptr_and_deleter.first; + data_ptr = baseAllocator_.allocate(nbytes); + data = data_ptr.get(); CAFFE_ENFORCE(data); CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault)); } else { CUDA_ENFORCE(cudaMallocHost(&data, nbytes)); + data_ptr = {data, data, &Delete, at::Device(CPU)}; } memset(data, 0, nbytes); - return {data, Delete}; + return data_ptr; } - MemoryDeleter GetDeleter() override { - return Delete; + at::DeleterFnPtr raw_deleter() const override { + return &Delete; } private: @@ -385,13 +387,14 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : CPUAllocator { class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: - std::pair New(size_t nbytes) const override; + at::DataPtr New(size_t nbytes) const override; DeviceType GetDeviceType() override { return CUDA; } void ExtractDeviceOption(DeviceOption* device, const void* data) override { + CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/context_gpu_test.cc b/caffe2/core/context_gpu_test.cc index ce1cb74b9881..956a231ce641 100644 --- a/caffe2/core/context_gpu_test.cc +++ b/caffe2/core/context_gpu_test.cc @@ -11,12 +11,6 @@ CAFFE2_DECLARE_bool(caffe2_cuda_full_device_control); namespace caffe2 { -namespace { -std::shared_ptr shared_from_new(std::pair&& p) { - return std::shared_ptr(p.first, std::move(p.second)); -} -} - TEST(CUDATest, HasCudaRuntime) { EXPECT_TRUE(HasCudaRuntime()); } @@ -25,7 +19,7 @@ TEST(CUDAContextTest, TestAllocDealloc) { if (!HasCudaGPU()) return; CUDAContext context(0); context.SwitchToDevice(); - auto data = shared_from_new(CUDAContext::New(10 * sizeof(float))); + auto data = CUDAContext::New(10 * sizeof(float)); EXPECT_NE(data.get(), nullptr); } @@ -66,20 +60,20 @@ TEST(CUDAContextTest, MemoryPoolAllocateDealloc) { for (int i = 0; i < NumCudaDevices(); ++i) { LOG(INFO) << "Device " << i << " of " << NumCudaDevices(); DeviceGuard guard(i); - auto allocated = shared_from_new(CUDAContext::New(nbytes)); + auto allocated = CUDAContext::New(nbytes); EXPECT_NE(allocated, nullptr); cudaPointerAttributes attr; CUDA_ENFORCE(cudaPointerGetAttributes(&attr, allocated.get())); EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice); EXPECT_EQ(attr.device, i); void* prev_allocated = allocated.get(); - allocated.reset(); - auto new_allocated = shared_from_new(CUDAContext::New(nbytes)); + allocated.clear(); + auto new_allocated = CUDAContext::New(nbytes); // With a pool, the above allocation should yield the same address. EXPECT_EQ(new_allocated.get(), prev_allocated); // But, if we are allocating something larger, we will have a different // chunk of memory. - auto larger_allocated = shared_from_new(CUDAContext::New(nbytes * 2)); + auto larger_allocated = CUDAContext::New(nbytes * 2); EXPECT_NE(larger_allocated.get(), prev_allocated); } } diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index a01f967b536a..987c9c422cb3 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -14,17 +14,17 @@ TEST(CPUContextTest, ATenCoreTest) { TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); - EXPECT_EQ((reinterpret_cast(data.first) % gCaffe2Alignment), 0); - data.second(data.first); + EXPECT_EQ((reinterpret_cast(data.get()) % gCaffe2Alignment), 0); + // data is freed when out of scope } } TEST(CPUContextTest, TestAllocDealloc) { - auto data_and_deleter = CPUContext::New(10 * sizeof(float)); - float* data = static_cast(data_and_deleter.first); + auto data_ptr = CPUContext::New(10 * sizeof(float)); + float* data = static_cast(data_ptr.get()); EXPECT_NE(data, nullptr); - auto dst_data_and_deleter = CPUContext::New(10 * sizeof(float)); - float* dst_data = static_cast(dst_data_and_deleter.first); + auto dst_data_ptr = CPUContext::New(10 * sizeof(float)); + float* dst_data = static_cast(dst_data_ptr.get()); EXPECT_NE(dst_data, nullptr); for (int i = 0; i < 10; ++i) { data[i] = i; @@ -35,8 +35,7 @@ TEST(CPUContextTest, TestAllocDealloc) { for (int i = 0; i < 10; ++i) { EXPECT_FLOAT_EQ(dst_data[i], i); } - data_and_deleter.second(data); - dst_data_and_deleter.second(dst_data); + // data_ptr is freed when out of scope } } // namespace caffe2 diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index 1bd39fa62a39..2e6e185ea387 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -24,8 +24,7 @@ struct CuDNNWorkspace { void* get(size_t nbytes) { if (nbytes_ < nbytes) { reset(); - auto data_and_deleter = CUDAContext::New(nbytes); - data_ = {data_and_deleter.first, data_and_deleter.second}; + data_ = CUDAContext::New(nbytes); nbytes_ = nbytes; } CAFFE_ENFORCE_GE(nbytes_, nbytes); @@ -33,12 +32,12 @@ struct CuDNNWorkspace { } void reset() { - data_ = nullptr; + data_.clear(); nbytes_ = 0; } private: - std::unique_ptr data_{nullptr, NoDelete}; + at::DataPtr data_{nullptr, nullptr, &NoDelete, at::Device(CUDA)}; size_t nbytes_{0}; }; diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 3eadaf0e71b1..12e76770201b 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -326,7 +326,7 @@ void TrackMemoryAlloc(size_t nbytes) } } -std::pair HIPStaticContext::New(size_t nbytes) const { +at::DataPtr HIPStaticContext::New(size_t nbytes) const { // Lock the mutex std::lock_guard lock(HIPContext::mutex()); // A one-time caffe2 cuda initializer. @@ -344,7 +344,7 @@ std::pair HIPStaticContext::New(size_t nbytes) const { g_size_map[ptr] = nbytes; g_hip_device_affiliation[ptr] = CaffeHipGetDevice(); } - return {ptr, Delete}; + return {ptr, ptr, &Delete, at::Device(HIP)}; case HipMemoryPoolType::CUB: HIP_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes)); g_hip_device_affiliation[ptr] = CaffeHipGetDevice(); @@ -353,7 +353,7 @@ std::pair HIPStaticContext::New(size_t nbytes) const { { g_size_map[ptr] = nbytes; } - return {ptr, Delete}; + return {ptr, ptr, &Delete, at::Device(HIP)}; case HipMemoryPoolType::THC: HIP_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */)); if (FLAGS_caffe2_gpu_memory_tracking) @@ -361,9 +361,9 @@ std::pair HIPStaticContext::New(size_t nbytes) const { g_size_map[ptr] = nbytes; g_hip_device_affiliation[ptr] = CaffeHipGetDevice(); } - return {ptr, Delete}; - } - return {nullptr, Delete}; + return {ptr, ptr, &Delete, at::Device(HIP)}; + } + return {nullptr, nullptr, &Delete, at::Device(HIP)}; } void HIPStaticContext::Delete(void* ptr) { diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index fb04336354e7..d7d456b975e5 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -206,7 +206,7 @@ class HIPContext final : public BaseContext { return hiprand_generator_; } - static std::pair New(size_t nbytes) { + static at::DataPtr New(size_t nbytes) { return StaticContext()->New(nbytes); } @@ -323,26 +323,28 @@ inline void CPUContext::CopyBytes( * GPU present during runtime, at global initialization time we will set * the CPU memory allocator to allocate pinned memory. */ -struct PinnedCPUAllocator final : CPUAllocator { +struct PinnedCPUAllocator final : public at::Allocator { PinnedCPUAllocator() {} ~PinnedCPUAllocator() override {} - std::pair New(size_t nbytes) override { + at::DataPtr allocate(size_t nbytes) const override { void* data; + at::DataPtr data_ptr; std::lock_guard lock(HIPContext::mutex()); if (IsNUMAEnabled()) { - auto ptr_and_deleter = baseAllocator_.New(nbytes); - data = ptr_and_deleter.first; + data_ptr = baseAllocator_.allocate(nbytes); + data = data_ptr.get(); CAFFE_ENFORCE(data); HIP_ENFORCE(hipHostRegister(data, nbytes, hipHostRegisterDefault)); } else { HIP_ENFORCE(hipHostMalloc(&data, nbytes)); + data_ptr = {data, data, &Delete, at::Device(CPU)}; } memset(data, 0, nbytes); - return {data, Delete}; + return data_ptr; } - MemoryDeleter GetDeleter() override { - return Delete; + at::DeleterFnPtr raw_deleter() const override { + return &Delete; } private: @@ -374,7 +376,7 @@ struct PinnedCPUAllocator final : CPUAllocator { class HIPStaticContext final : public BaseStaticContext { public: - std::pair New(size_t nbytes) const override; + at::DataPtr New(size_t nbytes) const override; DeviceType GetDeviceType() override { return HIP; diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h index 328c7522258d..46977248e0da 100644 --- a/caffe2/core/hip/miopen_wrapper.h +++ b/caffe2/core/hip/miopen_wrapper.h @@ -26,8 +26,7 @@ struct MIOPENWorkspace if(nbytes_ < nbytes) { reset(); - auto data_and_deleter = HIPContext::New(nbytes); - data_ = {data_and_deleter.first, data_and_deleter.second}; + data_ = HIPContext::New(nbytes); nbytes_ = nbytes; } CAFFE_ENFORCE_GE(nbytes_, nbytes); @@ -36,13 +35,13 @@ struct MIOPENWorkspace void reset() { - data_ = nullptr; - nbytes_ = 0; + data_.clear(); + nbytes_ = 0; } private: - std::unique_ptr data_{nullptr, NoDelete}; - size_t nbytes_{0}; + at::DataPtr data_; + size_t nbytes_{0}; }; // MIOPENState is the owner of the MIOPENWorkspace, and serializes all diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index 385ebf1d5f9f..867dee0cdf45 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -59,7 +59,7 @@ class C10_EXPORT QTensor { size_t source_size = std::accumulate( dim_source.begin(), dim_source.end(), 1, std::multiplies()); if ((source_size * (precision_ + signed_)) > capacity_) { - data_.reset(); + data_ptr_.clear(); capacity_ = 0; } dims_ = dim_source; @@ -104,12 +104,12 @@ class C10_EXPORT QTensor { void SetPrecision(const unsigned char precision) { precision_ = precision; - data_.reset(); + data_ptr_.clear(); } void SetSigned(const bool make_signed = true) { signed_ = make_signed; - data_.reset(); + data_ptr_.clear(); } void SetScale(const double scale) { @@ -121,19 +121,16 @@ class C10_EXPORT QTensor { } unsigned char* mutable_data() { - if (!data_) { - auto ptr_and_deleter = Context::New(nbytes()); - data_.reset( - static_cast(ptr_and_deleter.first), - ptr_and_deleter.second); + if (!data_ptr_) { + data_ptr_ = Context::New(nbytes()); capacity_ = nbytes() * CHAR_BIT; } CAFFE_ENFORCE(capacity_ == nbytes() * CHAR_BIT); - return data_.get(); + return static_cast(data_ptr_.get()); } inline const unsigned char* data() const { - return data_.get(); + return static_cast(data_ptr_.get()); } inline size_t size() const { @@ -242,7 +239,7 @@ class C10_EXPORT QTensor { unsigned char alignment_ = CHAR_BIT; // Allocated data. - std::shared_ptr data_; + at::DataPtr data_ptr_; // value = scale_ * (x + bias_) double scale_; diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 087078c507d1..b5c702ea3d64 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -55,7 +55,7 @@ class IDEEPContext final : public BaseContext { return *random_generator_.get(); } - inline static std::pair New(size_t nbytes) { + inline static at::DataPtr New(size_t nbytes) { return StaticContext()->New(nbytes); } @@ -176,8 +176,8 @@ inline void IDEEPContext::CopyBytes( class IDEEPStaticContext : public BaseStaticContext { public: - inline std::pair New(size_t nbytes) const override { - return GetCPUAllocator()->New(nbytes); + inline at::DataPtr New(size_t nbytes) const override { + return GetCPUAllocator()->allocate(nbytes); } DeviceType GetDeviceType() override { diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 8364026d91c6..7735283b6e04 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -62,7 +62,7 @@ class MKLContext : public BaseContext { return *random_generator_.get(); } - inline static std::pair New(size_t nbytes) { + inline static at::DataPtr New(size_t nbytes) { return StaticContext()->New(nbytes); } @@ -153,8 +153,8 @@ inline void MKLContext::CopyBytes( class MKLStaticContext : public BaseStaticContext { public: - inline std::pair New(size_t nbytes) const override { - return GetCPUAllocator()->New(nbytes); + inline at::DataPtr New(size_t nbytes) const override { + return GetCPUAllocator()->allocate(nbytes); } DeviceType GetDeviceType() override {