Add DeviceAllocator as the base device allocator (#138222)

# Motivation In line with [RFC] [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories, such as HuggingFace [so many if-else conditional code](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code). We would like to introduce a generic API set under torch.accelerator namespace to generalize these user cases. <div align="center"> <table> <tr> <td> Device-specific memory APIs torch.xxx.foo</td> <td> Device-agnostic memory APIs torch.accelerator.foo</td> </tr> <tr> <td> ```python torch.xxx.empty_cache ``` </td> <td> ```python torch.accelerator.empty_cache ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_peak_memory_stats ``` </td> <td> ```python torch.accelerator.reset_peak_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_accumulated_memory_stats ``` </td> <td> ```python torch.accelerator.reset_accumulated_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_stats ``` </td> <td> ```python torch.accelerator.memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_allocated ``` </td> <td> ```python torch.accelerator.memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_allocated ``` </td> <td> ```python torch.accelerator.max_memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_reserved ``` </td> <td> ```python torch.accelerator.memory_reserved ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_reserved ``` </td> <td> ```python torch.accelerator.max_memory_reserved ``` </td> </tr> </table> </div> # Solution This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `GetDeviceAllocator(allocator)->empty_cache()`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222 Approved by: https://github.com/albanD, https://github.com/Camyll
2025-10-20 21:14:14 +08:00 · 2025-08-08 15:17:56 +00:00
parent c5ec5458a5
commit d7114f05b1
8 changed files with 98 additions and 22 deletions
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@ -2,7 +2,6 @@
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/Functions.h>
-#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAFunctions.h>

 #include <cstddef>
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@ -2,6 +2,7 @@

 #include <ATen/Tensor.h>
 #include <c10/core/Device.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/flat_hash_map.h>
--- a/c10/core/CachingDeviceAllocator.cpp
+++ b/c10/core/CachingDeviceAllocator.cpp
@ -0,0 +1,10 @@
+#include <c10/core/CachingDeviceAllocator.h>
+
+namespace c10 {
+
+// Ensures proper DLL export of this pure virtual base class on Windows,
+// since it's mainly used in other DLLs outside c10.dll.
+DeviceAllocator::DeviceAllocator() = default;
+DeviceAllocator::~DeviceAllocator() = default;
+
+} // namespace c10
--- a/c10/core/CachingDeviceAllocator.h
+++ b/c10/core/CachingDeviceAllocator.h
@ -1,6 +1,7 @@
 #pragma once

 #include <c10/core/Allocator.h>
+#include <c10/core/Stream.h>

 namespace c10::CachingDeviceAllocator {

@ -59,3 +60,55 @@ struct DeviceStats {
 };

 } // namespace c10::CachingDeviceAllocator
+
+namespace c10 {
+
+using CaptureId_t = unsigned long long;
+
+// first is set if the instance is created by Graph mode capture_begin.
+// second is set if the instance is created by Graph mode graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct C10_API DeviceAllocator : public c10::Allocator {
+  DeviceAllocator();
+  ~DeviceAllocator() override;
+
+  // Returns true if the allocator has been properly initialized and is ready
+  // for use
+  virtual bool initialized() = 0;
+
+  // Releases all cached device memory from the specified memory pool back to
+  // the system
+  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
+
+  // Associates a memory allocation with a stream to establish dependency
+  // tracking. Prevents memory reuse until all operations on the specified
+  // stream complete
+  virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;
+
+  // Retrieves comprehensive memory statistics for the specified device,
+  // including allocation patterns, usage metrics
+  virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
+      c10::DeviceIndex device) = 0;
+
+  // Resets cumulative allocation statistics for the specified device to zero
+  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
+
+  // Resets peak memory usage statistics for the specified device
+  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
+};
+
+// This function is used to get the DeviceAllocator for a specific device type
+// and keep backward compatibility with c10::GetAllocator.
+C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
+  TORCH_CHECK(
+      t != DeviceType::CPU,
+      "getDeviceAllocator is not supported for CPU device type.");
+  auto* allocator = c10::GetAllocator(t);
+  auto* device_allocator = dynamic_cast<DeviceAllocator*>(allocator);
+  TORCH_INTERNAL_ASSERT(
+      device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
+  return device_allocator;
+}
+
+} // namespace c10
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -4118,7 +4118,18 @@ struct BackendStaticInitializer {

  BackendStaticInitializer() {
    auto r = parseEnvForBackend();
+// Register this HIP allocator as the CUDA allocator to allow it to work
+// with both c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA)
+// APIs. We don't perform this masquerading inside
+// HIPAllocatorMasqueradingAsCUDA because it needs to happen during static
+// initialization, and doing so there may introduce static initialization
+// order (SIOF) issues.
+#define HIP_MASQUERADING_AS_CUDA \
+  "cud"                          \
+  "a"
+    at::SetAllocator(c10::Device(HIP_MASQUERADING_AS_CUDA).type(), r, 0);
    allocator.store(r);
+#undef HIP_MASQUERADING_AS_CUDA
  }
 };

--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -202,25 +202,24 @@ struct ShareableHandle {
  std::string handle;
 };

-class CUDAAllocator : public Allocator {
+class CUDAAllocator : public DeviceAllocator {
 public:
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  virtual void init(int device_count) = 0;
-  virtual bool initialized() = 0;
  virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
-  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
  virtual void enable(bool value) = 0;
  virtual bool isEnabled() const = 0;
  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
-  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
-  virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
-      c10::DeviceIndex device) = 0;
-  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
-  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
+  // Keep for BC only
+  virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0;
+  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
+    CUDAStream cuda_stream = CUDAStream(stream);
+    recordStream(ptr, cuda_stream);
+  }
  virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
  virtual void beginAllocateToPool(
      c10::DeviceIndex device,
@ -525,6 +524,10 @@ inline void enablePeerAccess(

 namespace c10::cuda {

+// Keep BC only
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+
 // MemPool represents a pool of memory in a caching allocator. Currently,
 // it's just the ID of the pool object maintained in the CUDACachingAllocator.
 //
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@ -9,12 +9,6 @@

 namespace c10::cuda {

-using CaptureId_t = unsigned long long;
-
-// first is set if the instance is created by CUDAGraph::capture_begin.
-// second is set if the instance is created by at::cuda::graph_pool_handle.
-using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
-
 // RAII guard for "cudaStreamCaptureMode", a thread-local value
 // that controls the error-checking strictness of a capture.
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -539,7 +539,7 @@ class DeviceCachingAllocator {

 static void local_raw_delete(void* ptr);

-class XPUAllocator : public Allocator {
+class XPUAllocator : public DeviceAllocator {
 private:
  std::mutex mutex;
  ska::flat_hash_map<void*, Block*> allocated_blocks;
@ -575,6 +575,10 @@ class XPUAllocator : public Allocator {
    }
  }

+  bool initialized() override {
+    return !device_allocators.empty();
+  }
+
  void malloc(
      void** devPtr,
      DeviceIndex device,
@ -609,13 +613,13 @@ class XPUAllocator : public Allocator {
    }
  }

-  void emptyCache() {
+  void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
    for (auto& da : device_allocators) {
      da->emptyCache();
    }
  }

-  void recordStream(const DataPtr& ptr, XPUStream stream) {
+  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
    if (!ptr.get()) {
      return;
    }
@ -625,7 +629,8 @@ class XPUAllocator : public Allocator {

    Block* block = get_allocated_block(ptr.get());
    TORCH_CHECK(block, "No allocated block can be found.");
-    device_allocators[block->device]->recordStream(block, stream);
+    c10::xpu::XPUStream xpu_stream{stream};
+    device_allocators[block->device]->recordStream(block, xpu_stream);
  }

  DataPtr allocate(size_t size) override {
@ -678,17 +683,17 @@ class XPUAllocator : public Allocator {
        ": did you call init?");
  }

-  DeviceStats getDeviceStats(DeviceIndex device) {
+  DeviceStats getDeviceStats(DeviceIndex device) override {
    assertValidDevice(device);
    return device_allocators[device]->getStats();
  }

-  void resetPeakStats(DeviceIndex device) {
+  void resetPeakStats(DeviceIndex device) override {
    assertValidDevice(device);
    device_allocators[device]->resetPeakStats();
  }

-  void resetAccumulatedStats(DeviceIndex device) {
+  void resetAccumulatedStats(DeviceIndex device) override {
    assertValidDevice(device);
    device_allocators[device]->resetAccumulatedStats();
  }