Uses context pointer for deleter to enable multiple CUDAPluggableAllocator usage (#130472)
We should be able to create multiple CUDAPluggableAllocators in the same PyTorch program (see https://github.com/pytorch/pytorch/issues/124807 and https://github.com/pytorch/pytorch/pull/125722 for context). When mixing CUDAPluggableAllocators, the deleter passed in through each CUDAPluggableAllocator must be "attached" to the data_ptr and persist until program exit (when it is called to free the memory).

Currently, CUDAPluggableAllocator maintains a global `current_custom_allocator`. When creating a `DataPtr`, `raw_deleter` attaches `custom_raw_deleter` to the DataPtr, which calls `current_custom_allocator->raw_delete(...)`. This approach is fine with a single allocator, but with multiple allocators every DataPtr ends up using the deleter of whatever allocator `current_custom_allocator` happens to point to at free time. For example, if allocation 1 was made with `cudaMalloc` and allocation 2 with `ncclMemAlloc`, and `current_custom_allocator` currently points to the CUDAPluggableAllocator wrapping `ncclMemAlloc`, then cleaning up allocation 1 would incorrectly call `ncclMemFree` instead of `cudaFree`.

This PR solves the problem by remembering `free_fn_` in a per-allocation deleter context, so there is no need to go through an allocator object to find the right deleter.

CC: @zdevito @ptrblck @eqy

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130472
Approved by: https://github.com/eqy, https://github.com/ezyang
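To make the mechanism concrete, here is a minimal, self-contained C++ sketch of the pattern this PR adopts. The names are illustrative, not the PR's actual types: each allocation carries a heap-allocated context that remembers its own free function, and a single raw-deleter trampoline dispatches through that context instead of through a global allocator.

#include <cstdio>
#include <functional>

// Hypothetical stand-ins for two different backends' free routines.
void backend_a_free(void* data) { std::printf("freed by backend A\n"); }
void backend_b_free(void* data) { std::printf("freed by backend B\n"); }

// Per-allocation context: remembers which free function owns this pointer.
struct DeleterContext {
  std::function<void(void*)> free_fn;
  void* data;
  void call_and_destroy() {
    free_fn(data);
    delete this; // the context frees itself, mirroring the PR's free()
  }
};

// Single raw-deleter trampoline, analogous to custom_raw_deleter(void* ctx):
// it receives the context, not the data pointer, so no global allocator
// lookup is needed at free time.
void raw_deleter(void* ctx) {
  static_cast<DeleterContext*>(ctx)->call_and_destroy();
}

int main() {
  int x = 0, y = 0;
  // Two allocations "owned" by different backends.
  auto* ctx_a = new DeleterContext{backend_a_free, &x};
  auto* ctx_b = new DeleterContext{backend_b_free, &y};
  // Even if a "current allocator" global were repointed between these
  // calls, each pointer is still released by its own backend.
  raw_deleter(ctx_a); // freed by backend A
  raw_deleter(ctx_b); // freed by backend B
}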
Committed by: PyTorch MergeBot
Parent: 28a74b9fa4
Commit: 38b7d89aa4
@@ -5,6 +5,51 @@
 #include <ATen/test/allocator_clone_test.h>
+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
 
 TEST(AllocatorTestCUDA, test_clone) {
   test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
+
+static int called_dummy_free_0 = 0;
+static int called_dummy_free_1 = 0;
+
+void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
+void dummy_free_0(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_0++;
+}
+
+void dummy_free_1(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_1++;
+}
+
+// Tests that data_ptrs have their respective deleters
+// when mixing allocators
+TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
+  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
+  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
+  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
+  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
+  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Manually use a's deleter
+  auto* ctx = a.storage().data_ptr().get_context();
+  a.storage().data_ptr().get_deleter()(ctx);
+  a.storage().mutable_data_ptr().release_context();
+
+  // a's deleter is dummy_free_0
+  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
+  ASSERT_TRUE(called_dummy_free_0 == 1);
+
+  // Manually use b's deleter
+  ctx = b.storage().data_ptr().get_context();
+  b.storage().data_ptr().get_deleter()(ctx);
+  b.storage().mutable_data_ptr().release_context();
+
+  // b's deleter is dummy_free_1
+  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
+  ASSERT_TRUE(called_dummy_free_1 == 1);
+}
build_variables.bzl
@@ -661,6 +661,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/CudaIPCTypes.cpp",
     "torch/csrc/cuda/comm.cpp",
     "torch/csrc/cuda/memory_snapshot.cpp",
+    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
     "torch/csrc/inductor/aoti_runner/model_container_runner_cuda.cpp",
     "torch/csrc/inductor/aoti_torch/shim_cuda.cpp",
     "torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",

@@ -772,7 +773,6 @@ libtorch_python_cuda_core_sources = [
     "torch/csrc/cuda/shared/cudart.cpp",
     "torch/csrc/cuda/shared/nvtx.cpp",
     "torch/csrc/cuda/utils.cpp",
-    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
 ]

 libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [
torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -8,6 +8,23 @@

 namespace torch::cuda::CUDAPluggableAllocator {

+CUDAPluggableAllocatorDeleterContext::CUDAPluggableAllocatorDeleterContext(
+    std::function<FreeFuncType> free_fn,
+    void* data,
+    size_t size,
+    int device,
+    cudaStream_t stream)
+    : free_fn_(free_fn),
+      data_(data),
+      size_(size),
+      device_(device),
+      stream_(stream) {}
+
+void CUDAPluggableAllocatorDeleterContext::free() {
+  free_fn_(data_, size_, device_, stream_);
+  delete this;
+}
+
 int device_count = 0;

 void custom_raw_deleter(void* ptr);

@@ -26,8 +43,8 @@ _AllocationMetadata::_AllocationMetadata(
 // This avoids having to link against libtorch for C++ based custom allocators
 // And also use this from python
 CUDAPluggableAllocator::CUDAPluggableAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn)
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn)
     : alloc_fn_(std::move(alloc_fn)), free_fn_(std::move(free_fn)) {}

 CUDAPluggableAllocator::CUDAPluggableAllocator(CUDAPluggableAllocator& other)

@@ -99,8 +116,10 @@ c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) {
   C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
   void* r = this->malloc(size, device, stream);
+  auto* ctx = new CUDAPluggableAllocatorDeleterContext(
+      free_fn_, r, size, device, stream);
   c10::DataPtr data_ptr = {
-      r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
+      r, ctx, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
   return data_ptr;
 }

@@ -348,8 +367,8 @@ getCurrentAllocator() {
 // TODO: add more functions in the argument
 std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn) {
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn) {
   std::shared_ptr<CUDAPluggableAllocator> allocator(
       new CUDAPluggableAllocator(std::move(alloc_fn), std::move(free_fn)));
   allocator->init(device_count);

@@ -366,8 +385,8 @@ void changeCurrentAllocator(
   current_custom_allocator = allocator;
 }

-void custom_raw_deleter(void* ptr) {
-  current_custom_allocator->raw_delete(ptr);
+void custom_raw_deleter(void* ctx) {
+  reinterpret_cast<CUDAPluggableAllocatorDeleterContext*>(ctx)->free();
 }

 } // namespace torch::cuda::CUDAPluggableAllocator
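For orientation, a hedged usage sketch of the public API touched above: createCustomAllocator and changeCurrentAllocator are the real functions from this file, while my_malloc and my_free are hypothetical user callbacks (not part of the PR) that simply wrap cudaMalloc/cudaFree.

// Sketch: installing a pluggable allocator from C++, assuming a CUDA build.
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
#include <cuda_runtime_api.h>

// Signature must match MallocFuncType: void*(size_t, int, cudaStream_t).
void* my_malloc(size_t size, int device, cudaStream_t stream) {
  void* ptr = nullptr;
  cudaMalloc(&ptr, size); // device/stream unused in this simple sketch
  return ptr;
}

// Signature must match FreeFuncType: void(void*, size_t, int, cudaStream_t).
void my_free(void* ptr, size_t size, int device, cudaStream_t stream) {
  cudaFree(ptr);
}

void install_my_allocator() {
  auto allocator = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
      my_malloc, my_free);
  // After this call, each new allocation's DataPtr carries a deleter context
  // remembering my_free, so it is freed correctly even if a different
  // pluggable allocator is installed later.
  torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(allocator);
}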
torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -11,19 +11,47 @@

 namespace torch::cuda::CUDAPluggableAllocator {

+using MallocFuncType = void*(size_t, int, cudaStream_t);
+using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+
+// A CUDAPluggableAllocatorDeleterContext object is used as the `ctx`
+// argument for DataPtr. We need context because a user can use
+// multiple allocators in the same PyTorch program, and
+// the allocators can have different free functions, such as:
+// free, cudaFree, cudaFreeAsync, ncclMemFree etc.
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocatorDeleterContext {
+  explicit CUDAPluggableAllocatorDeleterContext(
+      std::function<FreeFuncType> free_fn,
+      void* data,
+      size_t size,
+      int device,
+      cudaStream_t stream);
+
+  void free();
+
+ private:
+  std::function<FreeFuncType> free_fn_;
+  void* data_;
+  size_t size_;
+  int device_;
+  cudaStream_t stream_;
+};
+
 #if defined(TORCH_HIP_VERSION)
 using streamType = c10::hip::HIPStream;
 #else
 using streamType = c10::cuda::CUDAStream;
 #endif

-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 getCurrentAllocator();
-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
-void changeCurrentAllocator(
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn);
+TORCH_CUDA_CPP_API void changeCurrentAllocator(
     const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
         allocator);

@@ -38,11 +66,11 @@ struct _AllocationMetadata {
   cudaStream_t stream;
 };

-struct CUDAPluggableAllocator
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
     : public c10::cuda::CUDACachingAllocator::CUDAAllocator {
   CUDAPluggableAllocator(
-      std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-      std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
+      std::function<MallocFuncType> alloc_fn,
+      std::function<FreeFuncType> free_fn);

   CUDAPluggableAllocator(CUDAPluggableAllocator& other);

@@ -131,8 +159,8 @@ struct CUDAPluggableAllocator
   void copy_data(void* dest, const void* src, std::size_t count) const final;

  protected:
-  std::function<void*(size_t, int, cudaStream_t)> alloc_fn_;
-  std::function<void(void*, size_t, int, cudaStream_t)> free_fn_;
+  std::function<MallocFuncType> alloc_fn_;
+  std::function<FreeFuncType> free_fn_;
   std::function<void(int)> init_fn_;
   std::function<void()> reset_fn_;
   std::function<void(double, int)> memory_fraction_fn_;
torch/csrc/cuda/Module.cpp
@@ -1175,16 +1175,14 @@ static void registerCudaPluggableAllocator(PyObject* module) {
         self.set_release_pool(func);
       });
   m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
-    using MallocFuncType = void*(size_t, int, cudaStream_t);
-    using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+    using namespace torch::cuda::CUDAPluggableAllocator;
     std::function<MallocFuncType> malloc_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<MallocFuncType*>(malloc_ptr);
     std::function<FreeFuncType> free_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<FreeFuncType*>(free_ptr);
-    return torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
-        malloc_fn, free_fn);
+    return createCustomAllocator(malloc_fn, free_fn);
   });

   // NOLINTNEXTLINE(bugprone-unused-raii)
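The binding above receives the allocation and free routines as raw integer addresses. A hypothetical C++ shared library exposing such routines might look like the sketch below; on the Python side, torch.cuda.memory.CUDAPluggableAllocator can load such a library and pass the symbol addresses through this `_cuda_customAllocator` entry point (the file name, build command, and function names here are assumptions for illustration).

// my_allocator.cpp -- hypothetical library; build with e.g.
//   nvcc -shared -Xcompiler -fPIC my_allocator.cpp -o my_allocator.so
#include <cuda_runtime_api.h>

extern "C" {

// Signature must match MallocFuncType: void*(size_t, int, cudaStream_t).
void* managed_alloc(size_t size, int device, cudaStream_t stream) {
  void* ptr = nullptr;
  cudaMallocManaged(&ptr, size); // device/stream ignored in this sketch
  return ptr;
}

// Signature must match FreeFuncType: void(void*, size_t, int, cudaStream_t).
void managed_free(void* ptr, size_t size, int device, cudaStream_t stream) {
  cudaFree(ptr);
}

} // extern "C"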