From 4588f49f68c209b8c9e8093fa61d027c5a596b18 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Fri, 21 Feb 2020 08:01:50 -0800
Subject: [PATCH] Kill cudaDeviceAllocator in THCState (#33380)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33380

Differential Revision: D19973151

Pulled By: ezyang

fbshipit-source-id: 41634c43b28ca723e39e761afd32e5015e122368
---
 aten/src/ATen/cuda/CUDAContext.cpp   |  4 ++--
 aten/src/ATen/native/cuda/Reduce.cuh |  5 +++--
 aten/src/THC/THCGeneral.cpp          | 18 ++++--------------
 aten/src/THC/THCGeneral.hpp          |  6 ++----
 aten/src/THC/THCStorage.cpp          |  2 +-
 aten/src/THC/generic/THCStorage.cpp  |  4 ++--
 6 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b0338..6fd74b58d007 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -1,5 +1,5 @@
 #include
-#include
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include
 #include
@@ -48,7 +48,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 }
 
 Allocator* getCUDADeviceAllocator() {
-  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+  return c10::cuda::CUDACachingAllocator::get();
 }
 
 } // namespace cuda
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 7fc4dd1bdb5e..c431b9e4df15 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <c10/cuda/CUDACachingAllocator.h>
 #include
 #include
 #include
@@ -631,7 +632,7 @@ struct AccumulationBuffer {
       numerator_ = 1;
       denominator_ = 1;
     } else {
-      auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
       buffer_ = allocator.allocate(size);
       acc_ptr_ = (char*)buffer_.get();
       numerator_ = acc_t_size;
@@ -790,7 +791,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
   at::DataPtr buffer;
   at::DataPtr semaphores;
   if (config.should_global_reduce()) {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer = allocator.allocate(config.global_memory_size());
     semaphores = allocator.allocate(config.semaphore_size());
diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
index 89e9257eeb62..697a3f9203fb 100644
--- a/aten/src/THC/THCGeneral.cpp
+++ b/aten/src/THC/THCGeneral.cpp
@@ -39,9 +39,6 @@ THCState* THCState_alloc(void)
 
 void THCudaInit(THCState* state)
 {
-  if (!state->cudaDeviceAllocator) {
-    state->cudaDeviceAllocator = c10::cuda::CUDACachingAllocator::get();
-  }
   if (!state->cudaHostAllocator) {
     state->cudaHostAllocator = getTHCCachingHostAllocator();
   }
@@ -107,9 +104,7 @@ void THCudaShutdown(THCState* state)
   free(state->p2pAccessEnabled);
   free(state->resourcesPerDevice);
 
-  if (state->cudaDeviceAllocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   if (state->cudaHostAllocator == getTHCCachingHostAllocator()) {
     THCCachingHostAllocator_emptyCache();
   }
@@ -295,13 +290,11 @@ void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line
 
 void* THCudaMalloc(THCState *state, size_t size)
 {
-  THCudaCheck(cudaGetLastError());
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
-  return allocator->raw_allocate(size);
+  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
 }
 
 void THCudaFree(THCState *state, void* ptr) {
-  state->cudaDeviceAllocator->raw_deallocate(ptr);
+  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
 }
 
 at::DataPtr THCudaHostAlloc(THCState *state, size_t size)
@@ -320,7 +313,6 @@ void THCudaHostRecord(THCState *state, void *ptr) {
 cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
 {
   size_t cachedBytes = 0;
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
 
   *largestBlock = 0;
   /* get info from CUDA first */
@@ -336,9 +328,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB
   /* not always true - our optimistic guess here */
   *largestBlock = *freeBytes;
 
-  if (allocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
-  }
+  c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
 
   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
diff --git a/aten/src/THC/THCGeneral.hpp b/aten/src/THC/THCGeneral.hpp
index dc12b43cff8b..fd3bea2f5cbf 100644
--- a/aten/src/THC/THCGeneral.hpp
+++ b/aten/src/THC/THCGeneral.hpp
@@ -10,12 +10,10 @@ struct THCState {
   int numDevices;
 
   /* Allocator using cudaMallocHost. */
-  // NB: These allocators (specifically, cudaHostAllocator) MUST implement
-  // maybeGlobalBoundDeleter, because we have a few use-cases where we need to
-  // do raw allocations with them (for Thrust).
+  // NB: cudaHostAllocator MUST implement maybeGlobalBoundDeleter, because we have
+  // a few use-cases where we need to do raw allocations with them (for Thrust).
   // TODO: Make this statically obvious
   at::Allocator* cudaHostAllocator;
-  at::Allocator* cudaDeviceAllocator;
 
   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp
index b58ed4bbf25d..f46556cc3215 100644
--- a/aten/src/THC/THCStorage.cpp
+++ b/aten/src/THC/THCStorage.cpp
@@ -65,7 +65,7 @@ THCStorage* THCStorage_new(
   THStorage* storage = c10::make_intrusive(
       data_type,
       0,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp
index b94ba0f4c6bd..7a3b40bcbcb8 100644
--- a/aten/src/THC/generic/THCStorage.cpp
+++ b/aten/src/THC/generic/THCStorage.cpp
@@ -46,7 +46,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   THStorage* storage = c10::make_intrusive(
       caffe2::TypeMeta::Make(),
       0,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
@@ -56,7 +56,7 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
   THStorage* storage = c10::make_intrusive(
       caffe2::TypeMeta::Make(),
       size,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
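
The change is mechanical: every read of state->cudaDeviceAllocator becomes a call into the global c10::cuda::CUDACachingAllocator interface (get(), raw_alloc(), raw_delete(), emptyCache(), cacheInfo()), and the guards that checked whether the state-owned allocator was the caching allocator become unconditional calls, as the THCGeneral.cpp hunks show. Below is a minimal sketch of caller code after this patch, assuming a PyTorch source build where c10/cuda/CUDACachingAllocator.h is available; the helper names are hypothetical and not part of the patch.

// Sketch only (not from the patch): scratch GPU allocations through the global
// caching allocator, mirroring the calls the patch introduces.
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>

// Hypothetical helper: RAII-style buffer of `size` bytes on the current CUDA
// device; the returned DataPtr hands the memory back to the allocator's cache
// when it goes out of scope (same pattern as the Reduce.cuh hunks).
c10::DataPtr allocate_scratch(size_t size) {
  c10::Allocator* allocator = c10::cuda::CUDACachingAllocator::get();
  return allocator->allocate(size);
}

// Hypothetical helpers: raw-pointer interface, mirroring what THCudaMalloc and
// THCudaFree reduce to after the patch.
void* raw_scratch_alloc(size_t size) {
  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
}

void raw_scratch_free(void* ptr) {
  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
}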