From 4588f49f68c209b8c9e8093fa61d027c5a596b18 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Fri, 21 Feb 2020 08:01:50 -0800
Subject: [PATCH] Kill cudaDeviceAllocator in THCState (#33380)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33380

Differential Revision: D19973151

Pulled By: ezyang

fbshipit-source-id: 41634c43b28ca723e39e761afd32e5015e122368
---
 aten/src/ATen/cuda/CUDAContext.cpp   |  4 ++--
 aten/src/ATen/native/cuda/Reduce.cuh |  5 +++--
 aten/src/THC/THCGeneral.cpp          | 18 ++++--------------
 aten/src/THC/THCGeneral.hpp          |  6 ++----
 aten/src/THC/THCStorage.cpp          |  2 +-
 aten/src/THC/generic/THCStorage.cpp  |  4 ++--
 6 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b0338..6fd74b58d007 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -1,5 +1,5 @@
 #include
-#include
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include
 #include
@@ -48,7 +48,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 }
 
 Allocator* getCUDADeviceAllocator() {
-  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+  return c10::cuda::CUDACachingAllocator::get();
 }
 
 } // namespace cuda
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 7fc4dd1bdb5e..c431b9e4df15 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <c10/cuda/CUDACachingAllocator.h>
 #include
 #include
 #include
@@ -631,7 +632,7 @@ struct AccumulationBuffer {
       numerator_ = 1;
       denominator_ = 1;
     } else {
-      auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
       buffer_ = allocator.allocate(size);
       acc_ptr_ = (char*)buffer_.get();
       numerator_ = acc_t_size;
@@ -790,7 +791,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
   at::DataPtr buffer;
   at::DataPtr semaphores;
   if (config.should_global_reduce()) {
-    auto& allocator = *at::globalContext().getTHCState()->cudaDeviceAllocator;
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
     buffer = allocator.allocate(config.global_memory_size());
     semaphores = allocator.allocate(config.semaphore_size());
diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
index 89e9257eeb62..697a3f9203fb 100644
--- a/aten/src/THC/THCGeneral.cpp
+++ b/aten/src/THC/THCGeneral.cpp
@@ -39,9 +39,6 @@ THCState* THCState_alloc(void)
 
 void THCudaInit(THCState* state)
 {
-  if (!state->cudaDeviceAllocator) {
-    state->cudaDeviceAllocator = c10::cuda::CUDACachingAllocator::get();
-  }
   if (!state->cudaHostAllocator) {
     state->cudaHostAllocator = getTHCCachingHostAllocator();
   }
@@ -107,9 +104,7 @@ void THCudaShutdown(THCState* state)
   free(state->p2pAccessEnabled);
   free(state->resourcesPerDevice);
 
-  if (state->cudaDeviceAllocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   if (state->cudaHostAllocator == getTHCCachingHostAllocator()) {
     THCCachingHostAllocator_emptyCache();
   }
@@ -295,13 +290,11 @@ void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line
 
 void* THCudaMalloc(THCState *state, size_t size)
 {
-  THCudaCheck(cudaGetLastError());
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
-  return allocator->raw_allocate(size);
+  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
 }
 
 void THCudaFree(THCState *state, void* ptr) {
-  state->cudaDeviceAllocator->raw_deallocate(ptr);
+  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
 }
 
 at::DataPtr THCudaHostAlloc(THCState *state, size_t size)
@@ -320,7 +313,6 @@ void THCudaHostRecord(THCState *state, void *ptr) {
 cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
 {
   size_t cachedBytes = 0;
-  c10::Allocator* allocator = state->cudaDeviceAllocator;
 
   *largestBlock = 0;
   /* get info from CUDA first */
@@ -336,9 +328,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB
   /* not always true - our optimistic guess here */
   *largestBlock = *freeBytes;
 
-  if (allocator == c10::cuda::CUDACachingAllocator::get()) {
-    c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
-  }
+  c10::cuda::CUDACachingAllocator::cacheInfo(device, &cachedBytes, largestBlock);
 
   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
diff --git a/aten/src/THC/THCGeneral.hpp b/aten/src/THC/THCGeneral.hpp
index dc12b43cff8b..fd3bea2f5cbf 100644
--- a/aten/src/THC/THCGeneral.hpp
+++ b/aten/src/THC/THCGeneral.hpp
@@ -10,12 +10,10 @@ struct THCState {
   int numDevices;
 
   /* Allocator using cudaMallocHost. */
-  // NB: These allocators (specifically, cudaHostAllocator) MUST implement
-  // maybeGlobalBoundDeleter, because we have a few use-cases where we need to
-  // do raw allocations with them (for Thrust).
+  // NB: cudaHostAllocator MUST implement maybeGlobalBoundDeleter, because we have
+  // a few use-cases where we need to do raw allocations with them (for Thrust).
   // TODO: Make this statically obvious
   at::Allocator* cudaHostAllocator;
-  at::Allocator* cudaDeviceAllocator;
 
   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp
index b58ed4bbf25d..f46556cc3215 100644
--- a/aten/src/THC/THCStorage.cpp
+++ b/aten/src/THC/THCStorage.cpp
@@ -65,7 +65,7 @@ THCStorage* THCStorage_new(
   THStorage* storage = c10::make_intrusive(
       data_type,
       0,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp
index b94ba0f4c6bd..7a3b40bcbcb8 100644
--- a/aten/src/THC/generic/THCStorage.cpp
+++ b/aten/src/THC/generic/THCStorage.cpp
@@ -46,7 +46,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   THStorage* storage = c10::make_intrusive(
       caffe2::TypeMeta::Make(),
       0,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
@@ -56,7 +56,7 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
   THStorage* storage = c10::make_intrusive(
       caffe2::TypeMeta::Make(),
       size,
-      state->cudaDeviceAllocator,
+      c10::cuda::CUDACachingAllocator::get(),
       true).release();
   return storage;
 }
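
The change is mechanical: every read of state->cudaDeviceAllocator becomes a call into the global c10::cuda::CUDACachingAllocator interface (get(), raw_alloc(), raw_delete(), emptyCache(), cacheInfo()), and the guards that checked whether the state-owned allocator was the caching allocator become unconditional calls, as the THCGeneral.cpp hunks show. Below is a minimal sketch of caller code after this patch, assuming a PyTorch source build where c10/cuda/CUDACachingAllocator.h is available; the helper names are hypothetical and not part of the patch.

// Sketch only (not from the patch): scratch GPU allocations through the global
// caching allocator, mirroring the calls the patch introduces.
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>

// Hypothetical helper: RAII-style buffer of `size` bytes on the current CUDA
// device; the returned DataPtr hands the memory back to the allocator's cache
// when it goes out of scope (same pattern as the Reduce.cuh hunks).
c10::DataPtr allocate_scratch(size_t size) {
  c10::Allocator* allocator = c10::cuda::CUDACachingAllocator::get();
  return allocator->allocate(size);
}

// Hypothetical helpers: raw-pointer interface, mirroring what THCudaMalloc and
// THCudaFree reduce to after the patch.
void* raw_scratch_alloc(size_t size) {
  return c10::cuda::CUDACachingAllocator::raw_alloc(size);
}

void raw_scratch_free(void* ptr) {
  c10::cuda::CUDACachingAllocator::raw_delete(ptr);
}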