Revert "raw_alloc ignores PYTORCH_NO_CUDA_MEMORY_CACHING (#131114)"
This reverts commit 70019074806920f95976fedad775d7570294f635. Reverted https://github.com/pytorch/pytorch/pull/131114 on behalf of https://github.com/PaliC due to failing internal builds ([comment](https://github.com/pytorch/pytorch/pull/131114#issuecomment-2390615007))
@@ -3127,28 +3127,12 @@ class DeviceCachingAllocator {
// Returns whether to force all allocations to bypass the caching allocator and
// go straight to cudaMalloc. This setting is useful when debugging GPU memory
// errors, since the caching allocator foils cuda-memcheck.
static bool forceUncachedAllocator() {
// Allow either CUDA or HIP name for env var for maximum user comfort
// the CUDA env var avoids being hipified in cuda_to_hip_mappings.py
static const char* cuda_env = getenv("PYTORCH_NO_CUDA_MEMORY_CACHING");
static const char* rocm_env = getenv("PYTORCH_NO_HIP_MEMORY_CACHING");
static bool force_uncached = (cuda_env != nullptr) || (rocm_env != nullptr);
bool forceUncachedAllocator() {
static bool force_uncached =
getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
return force_uncached;
}

static void* uncached_allocate(size_t size) {
void* devPtr = nullptr;
// Deliberately don't use cudaMallocMaybeCapturing here, to force an error
// if someone tries to use forceUncachedAllocator while capturing.
C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_memory_allocation(
c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
}
return devPtr;
}

static void uncached_delete(void* ptr) {
if (TORCH_SDT_IS_ENABLED(free)) {
TORCH_SDT_WITH_SEMAPHORE(free, ptr);
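Both versions of forceUncachedAllocator() shown above read an environment variable that makes every allocation bypass the cache and go straight to cudaMalloc, which is mainly useful when running under cuda-memcheck or compute-sanitizer. A minimal usage sketch in Python, not part of this diff; the variable must be set before the first CUDA allocation because the result is latched in a function-local static:

    import os

    # Set before any CUDA allocation: forceUncachedAllocator() caches this
    # value in a static on first use.
    os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"

    import torch

    x = torch.empty(1024, device="cuda")  # served directly by cudaMalloc
    del x  # freed immediately with cudaFree instead of being returned to the cache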
@@ -3166,9 +3150,6 @@ void local_raw_delete(void* ptr);

class NativeCachingAllocator : public CUDAAllocator {
private:
// allows this allocator to be turned on and off programmatically
bool enable_ = true;

// Shard allocation region to have independent mutexes to reduce contention.
static constexpr size_t kNumMutexShard = 67;

@@ -3343,14 +3324,6 @@ class NativeCachingAllocator : public CUDAAllocator {
da->emptyCache();
}

void enable(bool value) override {
enable_ = value;
}

bool isEnabled() const override {
return enable_;
}

void* getBaseAllocation(void* ptr, size_t* outSize) override {
Block* block = get_allocated_block(ptr);
if (!block) {
@@ -3485,9 +3458,17 @@ class NativeCachingAllocator : public CUDAAllocator {
void (*deleteFunc)(void*) = &local_raw_delete;
CUDAStream stream = cuda::getCurrentCUDAStream(device);

if (forceUncachedAllocator() || !isEnabled()) {
if (forceUncachedAllocator()) {
deleteFunc = &uncached_delete;
devPtr = uncached_allocate(size);

// Deliberately don't use cudaMallocMaybeCapturing here, to force an error
// if someone tries to use forceUncachedAllocator while capturing.
C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_memory_allocation(
c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
}
} else {
if (size != 0) {
this->malloc(&devPtr, device, size, stream);
@@ -3501,7 +3482,7 @@ class NativeCachingAllocator : public CUDAAllocator {
return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)};
}
DeleterFnPtr raw_deleter() const override {
if (forceUncachedAllocator() || !isEnabled()) {
if (forceUncachedAllocator()) {
return &uncached_delete;
} else {
return &local_raw_delete;
@@ -3558,14 +3539,10 @@ class NativeCachingAllocator : public CUDAAllocator {
if (nbytes == 0) {
return nullptr;
}
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
if (forceUncachedAllocator() || !isEnabled()) {
r = uncached_allocate(nbytes);
} else {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
}
malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
return r;
}

@@ -3573,14 +3550,10 @@ class NativeCachingAllocator : public CUDAAllocator {
if (nbytes == 0) {
return nullptr;
}
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
if (forceUncachedAllocator() || !isEnabled()) {
r = uncached_allocate(nbytes);
} else {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
malloc(&r, device, nbytes, stream);
}
malloc(&r, device, nbytes, stream);
return r;
}

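The two raw_alloc hunks above are the core of the revert: with the "|| !isEnabled()" checks gone, raw allocations always go through the caching allocator and therefore always show up in its statistics. A small sketch along the lines of the test deleted later in this diff (it uses private bindings and assumes a CUDA device is available):

    import torch

    torch.cuda.empty_cache()
    device = torch._C._cuda_getDevice()
    stream = torch._C._cuda_getCurrentRawStream(device)
    torch._C._cuda_resetAccumulatedMemoryStats(device)

    raw_alloc_size = 1024 * 1024  # 1 MB
    before = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
    ptr = torch._C._cuda_cudaCachingAllocator_raw_alloc(raw_alloc_size, stream)
    after = torch.cuda.memory_stats()["requested_bytes.all.allocated"]

    # With this revert applied, the raw allocation is always served by the
    # caching allocator, so the counter grows by the requested size.
    assert after - before == raw_alloc_size

    torch._C._cuda_cudaCachingAllocator_raw_delete(ptr)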
@@ -3625,11 +3598,7 @@ class NativeCachingAllocator : public CUDAAllocator {
}

void raw_delete(void* ptr) override {
if (forceUncachedAllocator() || !isEnabled()) {
uncached_delete(ptr);
} else {
this->free(ptr);
}
this->free(ptr);
}

// In CUDA IPC, sender sends a tensor to receiver via shareIPCHandle,
@@ -206,8 +206,6 @@ class CUDAAllocator : public Allocator {
virtual bool initialized() = 0;
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
virtual void emptyCache() = 0;
virtual void enable(bool value) = 0;
virtual bool isEnabled() const = 0;
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
@@ -329,14 +327,6 @@ inline void emptyCache() {
return get()->emptyCache();
}

inline void enable(bool value) {
return get()->enable(value);
}

inline bool isEnabled() {
return get()->isEnabled();
}

inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
return get()->cacheInfo(device, largestBlock);
}
@@ -496,14 +496,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
}
}

void enable(bool) override {
// cannot disable
}

bool isEnabled() const override {
return true;
}

void cacheInfo(c10::DeviceIndex device, size_t* maxWorkspaceGuess) override {
// The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp.
// Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable
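For the cudaMallocAsync backend shown above, enable(bool) is deliberately a no-op ("cannot disable") and isEnabled() always reports true, since caching is handled by the CUDA driver's memory pools rather than by PyTorch. A hedged sketch of selecting that backend; PYTORCH_CUDA_ALLOC_CONF is an existing setting, not introduced by this change:

    import os

    # Pick the cudaMallocAsync backend before CUDA is initialized.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

    import torch

    x = torch.empty(1024, device="cuda")  # allocation handled by cudaMallocAsync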
@@ -123,15 +123,6 @@ Memory management
MemPool
MemPoolContext

.. currentmodule:: torch.cuda.memory

.. autosummary::
:toctree: generated
:nosignatures:

caching_allocator_enable

.. currentmodule:: torch.cuda
.. autoclass:: torch.cuda.use_mem_pool

.. FIXME The following doesn't seem to exist. Is it supposed to?
@@ -3770,43 +3770,6 @@ class TestCudaMallocAsync(TestCase):
"pinned_num_register_threads:1024"
)

def test_cachingAllocator_raw_alloc(self):
# Test that raw_alloc respects the setting that
# activates/deactivates the caching allocator

# Helper function that calls raw_alloc and returns
# relevant field in data structure
def requested_bytes_alloc_stats(raw_alloc_size, stream):
start = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
torch._C._cuda_cudaCachingAllocator_raw_alloc(raw_alloc_size, stream)
finish = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
return finish - start

torch.cuda.empty_cache()
device = torch._C._cuda_getDevice()
stream = torch._C._cuda_getCurrentRawStream(device)
torch._C._cuda_resetAccumulatedMemoryStats(device)

# size of allocation
raw_alloc_size = 1024 * 1024 # 1 MB

try:
# Deactivate the caching allocator
torch.cuda.caching_allocator_enable(False)

# For a deactivated caching allocator, result is zero
cuda_alloc_size = requested_bytes_alloc_stats(raw_alloc_size, stream)
self.assertEqual(cuda_alloc_size, 0)

finally:
# Make sure we get back to the default state that is
# an activated caching allocator
torch.cuda.caching_allocator_enable(True)

# For an active caching allocator, result matches raw_alloc_size
cuda_alloc_size = requested_bytes_alloc_stats(raw_alloc_size, stream)
self.assertEqual(cuda_alloc_size, raw_alloc_size)

@parametrize("max_split_size_mb_setting", [False, True])
def test_raises_oom(self, max_split_size_mb_setting):
if max_split_size_mb_setting:
@@ -1833,7 +1833,6 @@ def _cuda_getCompiledVersion() -> _int: ...
def _cuda_cudaHostAllocator() -> _int: ...
def _cuda_cudaCachingAllocator_raw_alloc(size: _int, cuda_stream: _int) -> _int: ...
def _cuda_cudaCachingAllocator_raw_delete(ptr: _int) -> None: ...
def _cuda_cudaCachingAllocator_enable(val: _bool) -> None: ...
def _cuda_cudaCachingAllocator_set_allocator_settings(env: str) -> None: ...
def _cuda_beginAllocateToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ...
def _cuda_beginAllocateCurrentStreamToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ...
@@ -2540,7 +2540,6 @@ torch_non_c_binding_in_graph_functions = dict.fromkeys(
"torch.cuda.memory._snapshot",
"torch.cuda.memory.caching_allocator_alloc",
"torch.cuda.memory.caching_allocator_delete",
"torch.cuda.memory.caching_allocator_enable",
"torch.cuda.memory.change_current_allocator",
"torch.cuda.memory.empty_cache",
"torch.cuda.memory.get_allocator_backend",
@@ -110,10 +110,6 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
bool initialized() override;
void setMemoryFraction(double fraction, c10::DeviceIndex device) override;
void emptyCache() override;
void enable(bool) override {}
bool isEnabled() const override {
return true;
}
void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override;
void* getBaseAllocation(void* ptr, size_t* size) override;

@@ -428,19 +428,6 @@ PyObject* THCPModule_cudaCachingAllocator_raw_delete(
END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_enable(
PyObject* _unused,
PyObject* arg) {
HANDLE_TH_ERRORS
TORCH_CHECK(
THPUtils_checkBool(arg),
"cudaCachingAllocator_enable expects a bool, but got ",
THPUtils_typename(arg));
c10::cuda::CUDACachingAllocator::enable(THPUtils_unpackBool(arg));
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
PyObject* _unused,
PyObject* env) {
@@ -1869,10 +1856,6 @@ static struct PyMethodDef _THCPModule_methods[] = {
THCPModule_cudaCachingAllocator_raw_delete,
METH_O,
nullptr},
{"_cuda_cudaCachingAllocator_enable",
THCPModule_cudaCachingAllocator_enable,
METH_O,
nullptr},
{"_cuda_cudaCachingAllocator_set_allocator_settings",
THCPModule_cudaCachingAllocator_set_allocator_settings,
METH_O,
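Each METH_O entry above registers a binding that takes exactly one Python argument, and THCPModule_cudaCachingAllocator_enable additionally checks that the argument is a bool before forwarding it to the C++ allocator. A minimal sketch of the call this registration exposed (and which this revert removes again):

    import torch

    # Forwards to c10::cuda::CUDACachingAllocator::enable(false); anything
    # other than a bool fails the THPUtils_checkBool check in the binding.
    torch._C._cuda_cudaCachingAllocator_enable(False)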
@@ -1578,7 +1578,6 @@ __all__ = [
"amp",
"caching_allocator_alloc",
"caching_allocator_delete",
"caching_allocator_enable",
"can_device_access_peer",
"check_error",
"cudaStatus",
@@ -29,7 +29,6 @@ from ._memory_viz import memory as _memory, segments as _segments
__all__ = [
"caching_allocator_alloc",
"caching_allocator_delete",
"caching_allocator_enable",
"set_per_process_memory_fraction",
"empty_cache",
"memory_stats",
@@ -149,12 +148,6 @@ def caching_allocator_delete(mem_ptr):
torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)


def caching_allocator_enable(value: bool = True) -> None:
r"""Enable or disable the CUDA memory allocator. On by default."""
if is_initialized():
torch._C._cuda_cudaCachingAllocator_enable(value)


def set_per_process_memory_fraction(
fraction, device: Union[Device, int] = None
) -> None:
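The public wrapper above, torch.cuda.caching_allocator_enable, is what the deleted test drives; before this revert it could toggle the caching allocator at runtime, typically restoring the default in a finally block. A sketch, not part of the diff:

    import torch

    torch.cuda.init()
    try:
        # With the reverted change in place, this made later allocations
        # bypass the caching allocator entirely.
        torch.cuda.caching_allocator_enable(False)
        x = torch.empty(1024, device="cuda")
        del x
    finally:
        # Restore the default state: caching allocator active.
        torch.cuda.caching_allocator_enable(True)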
@@ -8568,7 +8568,6 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict(

CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict(
[
("PYTORCH_NO_CUDA_MEMORY_CACHING", ("PYTORCH_NO_CUDA_MEMORY_CACHING", API_CAFFE2)),
("cuda_stream", ("hip_stream", API_CAFFE2)),
# if the header is a native hip folder (under hip directory),
# there is no need to add a hip path to it; the trie in hipify script