Kill cudaDeviceAllocator in THCState (#33380)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/33380

Differential Revision: D19973151

Pulled By: ezyang

fbshipit-source-id: 41634c43b28ca723e39e761afd32e5015e122368
Committed by: Facebook Github Bot
Parent: a943b0518b
Commit: 4588f49f68
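The commit removes the cudaDeviceAllocator field from THCState and routes every call site straight to the c10 CUDA caching allocator. Below is a minimal sketch of the two call patterns the hunks switch to; the example() wrapper and the byte counts are illustrative only and not part of the commit:

#include <c10/cuda/CUDACachingAllocator.h>

void example() {
  // Allocator interface (used by Reduce.cuh and THCStorage): the returned
  // DataPtr hands the memory back to the cache when it goes out of scope.
  c10::Allocator* allocator = c10::cuda::CUDACachingAllocator::get();
  c10::DataPtr buffer = allocator->allocate(1024);

  // Raw interface (used by THCudaMalloc / THCudaFree after the change).
  void* p = c10::cuda::CUDACachingAllocator::raw_alloc(1024);
  c10::cuda::CUDACachingAllocator::raw_delete(p);
}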
@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THCGeneral.hpp>
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include <ATen/cuda/CUDAConfig.h>
 #include <mutex>
@@ -48,7 +48,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 }
 
 Allocator* getCUDADeviceAllocator() {
-  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+  return c10::cuda::CUDACachingAllocator::get();
 }
 
 } // namespace cuda
@@ -11,6 +11,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <c10/macros/Macros.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <functional>
 #include <iosfwd>
 #include <tuple>
@@ -631,7 +632,7 @@ struct AccumulationBuffer {
       numerator_ = 1;
       denominator_ = 1;
     } else {
-      auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
       buffer_ = allocator.allocate(size);
       acc_ptr_ = (char*)buffer_.get();
       numerator_ = acc_t_size;
@@ -790,7 +791,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
   at::DataPtr buffer;
   at::DataPtr semaphores;
   if (config.should_global_reduce()) {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer = allocator.allocate(config.global_memory_size());
     semaphores = allocator.allocate(config.semaphore_size());
@@ -39,9 +39,6 @@ THCState* THCState_alloc(void)
 
 void THCudaInit(THCState* state)
 {
-  if (!state->cudaDeviceAllocator) {
-    state->cudaDeviceAllocator = c10::cuda::CUDACachingAllocator::get();
-  }
   if (!state->cudaHostAllocator) {
     state->cudaHostAllocator = getTHCCachingHostAllocator();
   }
@@ -107,9 +104,7 @@ void THCudaShutdown(THCState* state)
   free(state->p2pAccessEnabled);
 
   free(state->resourcesPerDevice);
-  if (state->cudaDeviceAllocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   if (state->cudaHostAllocator == getTHCCachingHostAllocator()) {
     THCCachingHostAllocator_emptyCache();
   }
@@ -295,13 +290,11 @@ void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line
 
 void* THCudaMalloc(THCState *state, size_t size)
 {
   THCudaCheck(cudaGetLastError());
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
-  return allocator->raw_allocate(size);
+  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
 }
 
 void THCudaFree(THCState *state, void* ptr) {
-  state->cudaDeviceAllocator->raw_deallocate(ptr);
+  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
 }
 
 at::DataPtr THCudaHostAlloc(THCState *state, size_t size)
@@ -320,7 +313,6 @@ void THCudaHostRecord(THCState *state, void *ptr) {
 cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
 {
   size_t cachedBytes = 0;
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
 
   *largestBlock = 0;
   /* get info from CUDA first */
@@ -336,9 +328,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB
   /* not always true - our optimistic guess here */
   *largestBlock = *freeBytes;
 
-  if (allocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
-  }
+  c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
 
   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
@@ -10,12 +10,10 @@ struct THCState {
   int numDevices;
 
   /* Allocator using cudaMallocHost. */
-  // NB: These allocators (specifically, cudaHostAllocator) MUST implement
-  // maybeGlobalBoundDeleter, because we have a few use-cases where we need to
-  // do raw allocations with them (for Thrust).
+  // NB: cudaHostAllocator MUST implement maybeGlobalBoundDeleter, because we have
+  // a few use-cases where we need to do raw allocations with them (for Thrust).
   // TODO: Make this statically obvious
   at::Allocator* cudaHostAllocator;
-  at::Allocator* cudaDeviceAllocator;
 
   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
@@ -65,7 +65,7 @@ THCStorage* THCStorage_new(
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            data_type,
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
@@ -46,7 +46,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
@@ -56,7 +56,7 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            size,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }