Kill cudaDeviceAllocator in THCState (#33380)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/33380

Differential Revision: D19973151

Pulled By: ezyang

fbshipit-source-id: 41634c43b28ca723e39e761afd32e5015e122368
Xiang Gao
2020-02-21 08:01:50 -08:00
committed by Facebook GitHub Bot
parent a943b0518b
commit 4588f49f68
6 changed files with 14 additions and 25 deletions
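The change is mechanical: every call site that previously reached the device allocator through THCState now talks to c10::cuda::CUDACachingAllocator directly. A minimal sketch of the new raw-allocation pattern, using only calls that appear in this diff (the helper names are hypothetical):

    #include <cstddef>
    #include <c10/cuda/CUDACachingAllocator.h>

    // Raw device allocations no longer need a THCState*; they go
    // straight to the caching allocator.
    void* alloc_scratch(size_t size) {
      return c10::cuda::CUDACachingAllocator::raw_alloc(size);
    }

    void free_scratch(void* ptr) {
      c10::cuda::CUDACachingAllocator::raw_delete(ptr);
    }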

View File

@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THCGeneral.hpp>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <ATen/cuda/CUDAConfig.h>
 #include <mutex>
@@ -48,7 +48,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 }

 Allocator* getCUDADeviceAllocator() {
-  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+  return c10::cuda::CUDACachingAllocator::get();
 }

 } // namespace cuda
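getCUDADeviceAllocator() keeps its signature, so callers are unaffected; they now simply receive the caching allocator singleton. A sketch of a typical caller (the helper is hypothetical, not part of this diff):

    #include <ATen/cuda/CUDAContext.h>

    // allocate() returns a DataPtr, so the buffer goes back to the
    // cache automatically when the DataPtr is destroyed.
    at::DataPtr make_device_buffer(size_t bytes) {
      at::Allocator* allocator = at::cuda::getCUDADeviceAllocator();
      return allocator->allocate(bytes);
    }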

View File

@@ -11,6 +11,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <c10/macros/Macros.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <functional>
 #include <iosfwd>
 #include <tuple>
@@ -631,7 +632,7 @@ struct AccumulationBuffer {
     numerator_ = 1;
     denominator_ = 1;
   } else {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer_ = allocator.allocate(size);
     acc_ptr_ = (char*)buffer_.get();
     numerator_ = acc_t_size;
@@ -790,7 +791,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
   at::DataPtr buffer;
   at::DataPtr semaphores;
   if (config.should_global_reduce()) {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer = allocator.allocate(config.global_memory_size());
     semaphores = allocator.allocate(config.semaphore_size());
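Both reduction paths rely on the same RAII property: allocator.allocate() hands back a DataPtr, so the global-reduce scratch space is released back to the cache without an explicit free. Roughly (a standalone sketch with made-up sizes):

    #include <c10/cuda/CUDACachingAllocator.h>

    void reduce_scratch_sketch() {
      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
      c10::DataPtr buffer = allocator.allocate(4096);     // global-reduce buffer
      c10::DataPtr semaphores = allocator.allocate(256);  // inter-block semaphores
      // ... kernels would consume buffer.get() and semaphores.get() ...
    }  // both allocations return to the cache here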

View File

@@ -39,9 +39,6 @@ THCState* THCState_alloc(void)

 void THCudaInit(THCState* state)
 {
-  if (!state->cudaDeviceAllocator) {
-    state->cudaDeviceAllocator = c10::cuda::CUDACachingAllocator::get();
-  }
   if (!state->cudaHostAllocator) {
     state->cudaHostAllocator = getTHCCachingHostAllocator();
   }
@@ -107,9 +104,7 @@ void THCudaShutdown(THCState* state)
   free(state->p2pAccessEnabled);
   free(state->resourcesPerDevice);

-  if (state->cudaDeviceAllocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   if (state->cudaHostAllocator == getTHCCachingHostAllocator()) {
     THCCachingHostAllocator_emptyCache();
   }
@@ -295,13 +290,11 @@ void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line

 void* THCudaMalloc(THCState *state, size_t size)
 {
   THCudaCheck(cudaGetLastError());
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
-  return allocator->raw_allocate(size);
+  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
 }

 void THCudaFree(THCState *state, void* ptr) {
-  state->cudaDeviceAllocator->raw_deallocate(ptr);
+  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
 }

 at::DataPtr THCudaHostAlloc(THCState *state, size_t size)
@@ -320,7 +313,6 @@ void THCudaHostRecord(THCState *state, void *ptr) {

 cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
 {
   size_t cachedBytes = 0;
-  c10::Allocator* allocator = state->cudaDeviceAllocator;

   *largestBlock = 0;
   /* get info from CUDA first */
@@ -336,9 +328,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB
   /* not always true - our optimistic guess here */
   *largestBlock = *freeBytes;

-  if (allocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
-  }
+  c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);

   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
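With the allocator no longer swappable, the guard around cacheInfo() is gone: memory the caching allocator holds but is not currently using is unconditionally counted as free. The accounting, as a standalone sketch (error handling trimmed; assumes the right device is already current):

    #include <cuda_runtime.h>
    #include <c10/cuda/CUDACachingAllocator.h>

    cudaError_t mem_info_sketch(int device, size_t* freeBytes, size_t* totalBytes) {
      size_t cachedBytes = 0;
      size_t largestBlock = 0;
      cudaError_t err = cudaMemGetInfo(freeBytes, totalBytes);  // the driver's view
      if (err != cudaSuccess) return err;
      c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, &largestBlock);
      *freeBytes += cachedBytes;  // cached-but-idle blocks are reusable, hence "free"
      return cudaSuccess;
    }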

View File

@@ -10,12 +10,10 @@ struct THCState {
   int numDevices;

   /* Allocator using cudaMallocHost. */
-  // NB: These allocators (specifically, cudaHostAllocator) MUST implement
-  // maybeGlobalBoundDeleter, because we have a few use-cases where we need to
-  // do raw allocations with them (for Thrust).
+  // NB: cudaHostAllocator MUST implement maybeGlobalBoundDeleter, because we have
+  // a few use-cases where we need to do raw allocations with them (for Thrust).
   // TODO: Make this statically obvious
   at::Allocator* cudaHostAllocator;
-  at::Allocator* cudaDeviceAllocator;

   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
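The deleted field is the point of the commit: THCState no longer carries any device-allocator state. The surviving NB comment concerns cudaHostAllocator's raw-allocation requirement, which concretely means calls like this must keep working (hypothetical helper; raw_allocate is the non-RAII entry point on c10::Allocator):

    #include <cstddef>
    #include <c10/core/Allocator.h>

    // Thrust wants plain pointers rather than DataPtrs, so the host
    // allocator is exercised through the raw interface.
    void* raw_pinned_alloc(c10::Allocator* cudaHostAllocator, size_t size) {
      return cudaHostAllocator->raw_allocate(size);
    }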

View File

@@ -65,7 +65,7 @@ THCStorage* THCStorage_new(
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            data_type,
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
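THCStorage_new builds a zero-element, resizable storage, so nothing is allocated at construction; the allocator is stashed in the StorageImpl for later resizes. The same construction outside THC would look roughly like this (function name made up):

    #include <c10/core/StorageImpl.h>
    #include <c10/util/intrusive_ptr.h>
    #include <c10/cuda/CUDACachingAllocator.h>

    c10::intrusive_ptr<c10::StorageImpl> empty_cuda_storage(caffe2::TypeMeta data_type) {
      return c10::make_intrusive<c10::StorageImpl>(
          data_type,
          0,                                        // no elements yet
          c10::cuda::CUDACachingAllocator::get(),   // consulted on later resize
          /*resizable=*/true);
    }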

View File

@@ -46,7 +46,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            0,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
@@ -56,7 +56,7 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
   THStorage* storage = c10::make_intrusive<at::StorageImpl>(
                            caffe2::TypeMeta::Make<scalar_t>(),
                            size,
-                           state->cudaDeviceAllocator,
+                           c10::cuda::CUDACachingAllocator::get(),
                            true).release();
   return storage;
 }
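THCStorage_(newWithSize) is the eager variant: with a nonzero element count, the StorageImpl constructor allocates through the caching allocator immediately rather than waiting for a resize. For example, reusing the includes from the previous sketch (hypothetical instantiation for float):

    auto storage = c10::make_intrusive<c10::StorageImpl>(
        caffe2::TypeMeta::Make<float>(),
        16,                                       // allocates room for 16 floats now
        c10::cuda::CUDACachingAllocator::get(),
        /*resizable=*/true);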