Kill cudaDeviceAllocator in THCState (#33380)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/33380

Differential Revision: D19973151

Pulled By: ezyang

fbshipit-source-id: 41634c43b28ca723e39e761afd32e5015e122368
Xiang Gao
2020-02-21 08:01:50 -08:00
committed by Facebook GitHub Bot
parent a943b0518b
commit 4588f49f68
6 changed files with 14 additions and 25 deletions
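The change is mechanical: every call site that previously reached the device allocator through THCState now talks to c10::cuda::CUDACachingAllocator directly. A minimal sketch of the new raw-allocation pattern, using only calls that appear in this diff (the helper names are hypothetical):

    #include <cstddef>
    #include <c10/cuda/CUDACachingAllocator.h>

    // Raw device allocations no longer need a THCState*; they go
    // straight to the caching allocator.
    void* alloc_scratch(size_t size) {
      return c10::cuda::CUDACachingAllocator::raw_alloc(size);
    }

    void free_scratch(void* ptr) {
      c10::cuda::CUDACachingAllocator::raw_delete(ptr);
    }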

View File

@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THCGeneral.hpp>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <ATen/cuda/CUDAConfig.h>
 #include <mutex>
@@ -48,7 +48,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 }

 Allocator* getCUDADeviceAllocator() {
-  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+  return c10::cuda::CUDACachingAllocator::get();
 }

 } // namespace cuda
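getCUDADeviceAllocator() keeps its signature, so callers are unaffected; they now simply receive the caching allocator singleton. A sketch of a typical caller (the helper is hypothetical, not part of this diff):

    #include <ATen/cuda/CUDAContext.h>

    // allocate() returns a DataPtr, so the buffer goes back to the
    // cache automatically when the DataPtr is destroyed.
    at::DataPtr make_device_buffer(size_t bytes) {
      at::Allocator* allocator = at::cuda::getCUDADeviceAllocator();
      return allocator->allocate(bytes);
    }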

View File

@@ -11,6 +11,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <c10/macros/Macros.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <functional>
 #include <iosfwd>
 #include <tuple>
@@ -631,7 +632,7 @@ struct AccumulationBuffer {
     numerator_ = 1;
     denominator_ = 1;
   } else {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer_ = allocator.allocate(size);
     acc_ptr_ = (char*)buffer_.get();
     numerator_ = acc_t_size;
@@ -790,7 +791,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
   at::DataPtr buffer;
   at::DataPtr semaphores;
   if (config.should_global_reduce()) {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer = allocator.allocate(config.global_memory_size());
     semaphores = allocator.allocate(config.semaphore_size());
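Both reduction paths rely on the same RAII property: allocator.allocate() hands back a DataPtr, so the global-reduce scratch space is released back to the cache without an explicit free. Roughly (a standalone sketch with made-up sizes):

    #include <c10/cuda/CUDACachingAllocator.h>

    void reduce_scratch_sketch() {
      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
      c10::DataPtr buffer = allocator.allocate(4096);     // global-reduce buffer
      c10::DataPtr semaphores = allocator.allocate(256);  // inter-block semaphores
      // ... kernels would consume buffer.get() and semaphores.get() ...
    }  // both allocations return to the cache here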

View File

@@ -39,9 +39,6 @@ THCState* THCState_alloc(void)

 void THCudaInit(THCState* state)
 {
-  if (!state->cudaDeviceAllocator) {
-    state->cudaDeviceAllocator = c10::cuda::CUDACachingAllocator::get();
-  }
   if (!state->cudaHostAllocator) {
     state->cudaHostAllocator = getTHCCachingHostAllocator();
   }
@@ -107,9 +104,7 @@ void THCudaShutdown(THCState* state)
   free(state->p2pAccessEnabled);
   free(state->resourcesPerDevice);

-  if (state->cudaDeviceAllocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   if (state->cudaHostAllocator == getTHCCachingHostAllocator()) {
     THCCachingHostAllocator_emptyCache();
   }
@@ -295,13 +290,11 @@ void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line

 void* THCudaMalloc(THCState *state, size_t size)
 {
   THCudaCheck(cudaGetLastError());
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
-  return allocator->raw_allocate(size);
+  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
 }

 void THCudaFree(THCState *state, void* ptr) {
-  state->cudaDeviceAllocator->raw_deallocate(ptr);
+  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
 }

 at::DataPtr THCudaHostAlloc(THCState *state, size_t size)
@@ -320,7 +313,6 @@ void THCudaHostRecord(THCState *state, void *ptr) {

 cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
 {
   size_t cachedBytes = 0;
-  c10::Allocator* allocator = state->cudaDeviceAllocator;

   *largestBlock = 0;
   /* get info from CUDA first */
@@ -336,9 +328,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB
   /* not always true - our optimistic guess here */
   *largestBlock = *freeBytes;

-  if (allocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
-  }
+  c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);

   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
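With the allocator no longer swappable, the guard around cacheInfo() is gone: memory the caching allocator holds but is not currently using is unconditionally counted as free. The accounting, as a standalone sketch (error handling trimmed; assumes the right device is already current):

    #include <cuda_runtime.h>
    #include <c10/cuda/CUDACachingAllocator.h>

    cudaError_t mem_info_sketch(int device, size_t* freeBytes, size_t* totalBytes) {
      size_t cachedBytes = 0;
      size_t largestBlock = 0;
      cudaError_t err = cudaMemGetInfo(freeBytes, totalBytes);  // the driver's view
      if (err != cudaSuccess) return err;
      c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, &largestBlock);
      *freeBytes += cachedBytes;  // cached-but-idle blocks are reusable, hence "free"
      return cudaSuccess;
    }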

View File

@@ -10,12 +10,10 @@ struct THCState {
   int numDevices;

   /* Allocator using cudaMallocHost. */
-  // NB: These allocators (specifically, cudaHostAllocator) MUST implement
-  // maybeGlobalBoundDeleter, because we have a few use-cases where we need to
-  // do raw allocations with them (for Thrust).
+  // NB: cudaHostAllocator MUST implement maybeGlobalBoundDeleter, because we have
+  // a few use-cases where we need to do raw allocations with them (for Thrust).
   // TODO: Make this statically obvious
   at::Allocator* cudaHostAllocator;
-  at::Allocator* cudaDeviceAllocator;

   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
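The deleted field is the point of the commit: THCState no longer carries any device-allocator state. The surviving NB comment concerns cudaHostAllocator's raw-allocation requirement, which concretely means calls like this must keep working (hypothetical helper; raw_allocate is the non-RAII entry point on c10::Allocator):

    #include <cstddef>
    #include <c10/core/Allocator.h>

    // Thrust wants plain pointers rather than DataPtrs, so the host
    // allocator is exercised through the raw interface.
    void* raw_pinned_alloc(c10::Allocator* cudaHostAllocator, size_t size) {
      return cudaHostAllocator->raw_allocate(size);
    }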

View File

@@ -65,7 +65,7 @@ THCStorage* THCStorage_new(
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            data_type,
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
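THCStorage_new builds a zero-element, resizable storage, so nothing is allocated at construction; the allocator is stashed in the StorageImpl for later resizes. The same construction outside THC would look roughly like this (function name made up):

    #include <c10/core/StorageImpl.h>
    #include <c10/util/intrusive_ptr.h>
    #include <c10/cuda/CUDACachingAllocator.h>

    c10::intrusive_ptr<c10::StorageImpl> empty_cuda_storage(caffe2::TypeMeta data_type) {
      return c10::make_intrusive<c10::StorageImpl>(
          data_type,
          0,                                        // no elements yet
          c10::cuda::CUDACachingAllocator::get(),   // consulted on later resize
          /*resizable=*/true);
    }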

View File

@@ -46,7 +46,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
@@ -56,7 +56,7 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            size,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
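THCStorage_(newWithSize) is the eager variant: with a nonzero element count, the StorageImpl constructor allocates through the caching allocator immediately rather than waiting for a resize. For example, reusing the includes from the previous sketch (hypothetical instantiation for float):

    auto storage = c10::make_intrusive<c10::StorageImpl>(
        caffe2::TypeMeta::Make<float>(),
        16,                                       // allocates room for 16 floats now
        c10::cuda::CUDACachingAllocator::get(),
        /*resizable=*/true);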