Update

[ghstack-poisoned]
2025-11-11 22:34:53 +08:00 · 2025-11-10 18:36:26 -08:00
parent 5e90e65d0f
commit 813523de1a
9 changed files with 140 additions and 116 deletions
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@ -1,6 +1,7 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
+#include <ATen/cuda/MemPool.h>
 #include <ATen/Functions.h>
 #include <c10/cuda/CUDAFunctions.h>

@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false;
 MempoolId_t graph_pool_handle() {
  // Sets just the second value, to distinguish it from MempoolId_ts created from
  // cudaStreamGetCaptureInfo id_s in capture_begin.
-  return c10::cuda::MemPool::graph_pool_handle();
+  return at::cuda::MemPool::graph_pool_handle();
 }

 /**
@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
  } else {
    // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
    // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
-    mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
+    mempool_id_ = at::cuda::MemPool::graph_pool_handle(false);
    TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
  }

--- a/aten/src/ATen/cuda/MemPool.cpp
+++ b/aten/src/ATen/cuda/MemPool.cpp
@ -0,0 +1,69 @@
+#include <ATen/core/CachingHostAllocator.h>
+#include <ATen/cuda/MemPool.h>
+
+namespace at::cuda {
+
+// uid_ is incremented when a user creates a MemPool,
+// for example: using graph_pool_handle() or at::cuda::MemPool().
+//
+// uuid_ is incremented when CUDAGraph creates a MemPool
+// as a result of a user not providing a pool.
+//
+// MempoolId_t of {0, 0} is used to denote when no MemPool has been
+// passed to a function, either by user or CUDAGraphs. For example,
+// default value of MempoolId_t for capture_begin function is {0, 0}.
+// That's why uid_ and uuid_ start at 1.
+std::atomic<CaptureId_t> MemPool::uid_{1};
+std::atomic<CaptureId_t> MemPool::uuid_{1};
+
+MemPool::MemPool(
+    CUDACachingAllocator::CUDAAllocator* allocator,
+    bool is_user_created,
+    bool use_on_oom)
+    : allocator_(allocator), is_user_created_(is_user_created) {
+  if (is_user_created_) {
+    id_ = {0, uid_++};
+  } else {
+    id_ = {uuid_++, 0};
+  }
+  device_ = c10::cuda::current_device();
+  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
+  if (use_on_oom) {
+    CUDACachingAllocator::setUseOnOOM(device_, id_);
+  }
+}
+
+MemPool::~MemPool() {
+  // Note: this destructor used to assert
+  //   TORCH_INTERNAL_ASSERT(use_count() == 1);
+  // That assertion does not hold when the memory pool is shared with a
+  // CUDA graph: the CUDAGraph keeps the pool's use count raised until
+  // the graph is reset.
+  CUDACachingAllocator::releasePool(device_, id_);
+  c10::cuda::CUDACachingAllocator::emptyCache(id_);
+}
+
+MempoolId_t MemPool::id() {
+  return id_;
+}
+
+CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
+  return allocator_;
+}
+
+int MemPool::use_count() {
+  return CUDACachingAllocator::getPoolUseCount(device_, id_);
+}
+
+c10::DeviceIndex MemPool::device() {
+  return device_;
+}
+
+MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
+  if (is_user_created) {
+    return {0, uid_++};
+  }
+  return {uuid_++, 0};
+}
+
+} // namespace at::cuda
--- a/aten/src/ATen/cuda/MemPool.h
+++ b/aten/src/ATen/cuda/MemPool.h
@ -0,0 +1,44 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+
+namespace at::cuda {
+
+// Re-exported from c10 for backward compatibility (BC) only.
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+
+// MemPool represents a pool of memory in a caching allocator. Currently,
+// it's just the ID of the pool object maintained in the CUDACachingAllocator.
+//
+// An allocator pointer can be passed to the MemPool to define how the
+// allocations should be done in the pool. For example: using a different
+// system allocator such as ncclMemAlloc.
+struct TORCH_CUDA_CPP_API MemPool {
+  MemPool(
+      c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
+      bool is_user_created = true,
+      bool use_on_oom = false);
+  MemPool(const MemPool&) = delete;
+  MemPool(MemPool&&) = default;
+  MemPool& operator=(const MemPool&) = delete;
+  MemPool& operator=(MemPool&&) = default;
+  ~MemPool();
+
+  MempoolId_t id();
+  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator();
+  int use_count();
+  c10::DeviceIndex device();
+  static MempoolId_t graph_pool_handle(bool is_user_created = true);
+
+ private:
+  static std::atomic<CaptureId_t> uid_;
+  static std::atomic<CaptureId_t> uuid_;
+  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_;
+  bool is_user_created_;
+  MempoolId_t id_;
+  c10::DeviceIndex device_;
+};
+
+} // namespace at::cuda
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@ -19,6 +19,17 @@

 namespace c10 {

+using CaptureId_t = unsigned long long;
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // A DataPtr is a unique pointer (with an attached deleter and some
 // context for the deleter) to some memory, which also records what
 // device is for its data.
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState(
  }
 }

-struct MempoolIdHash {
-  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
-    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
-  }
-};
-
 cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) {
  if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) {
    *ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size);
@ -4513,66 +4507,3 @@ std::atomic<CUDAAllocator*> allocator;
 static BackendStaticInitializer backend_static_initializer;
 } // namespace cuda::CUDACachingAllocator
 } // namespace c10
-
-namespace c10::cuda {
-
-// uid_ is incremented when a user creates a MemPool,
-// for example: using graph_pool_handle() or c10::cuda::MemPool().
-//
-// uuid_ is incremented when CUDAGraph creates a MemPool
-// as a result of a user not providing a pool.
-//
-// MempoolId_t of {0, 0} is used to denote when no MemPool has been
-// passed to a function, either by user or CUDAGraphs. For example,
-// default value of MempoolId_t for capture_begin function is {0, 0}.
-// That's why uid_ and uuid_ start at 1.
-std::atomic<CaptureId_t> MemPool::uid_{1};
-std::atomic<CaptureId_t> MemPool::uuid_{1};
-
-MemPool::MemPool(
-    CUDACachingAllocator::CUDAAllocator* allocator,
-    bool is_user_created,
-    bool use_on_oom)
-    : allocator_(allocator), is_user_created_(is_user_created) {
-  if (is_user_created_) {
-    id_ = {0, uid_++};
-  } else {
-    id_ = {uuid_++, 0};
-  }
-  device_ = c10::cuda::current_device();
-  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
-  if (use_on_oom) {
-    CUDACachingAllocator::setUseOnOOM(device_, id_);
-  }
-}
-
-MemPool::~MemPool() {
-  TORCH_INTERNAL_ASSERT(use_count() == 1);
-  CUDACachingAllocator::releasePool(device_, id_);
-  c10::cuda::CUDACachingAllocator::emptyCache(id_);
-}
-
-MempoolId_t MemPool::id() {
-  return id_;
-}
-
-CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
-  return allocator_;
-}
-
-int MemPool::use_count() {
-  return CUDACachingAllocator::getPoolUseCount(device_, id_);
-}
-
-c10::DeviceIndex MemPool::device() {
-  return device_;
-}
-
-MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
-  if (is_user_created) {
-    return {0, uid_++};
-  }
-  return {uuid_++, 0};
-}
-
-} // namespace c10::cuda
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -554,41 +554,7 @@ inline std::string getUserMetadata() {
 } // namespace c10::cuda::CUDACachingAllocator

 namespace c10::cuda {
-
 // Keep BC only
 using c10::CaptureId_t;
 using c10::MempoolId_t;
-
-// MemPool represents a pool of memory in a caching allocator. Currently,
-// it's just the ID of the pool object maintained in the CUDACachingAllocator.
-//
-// An allocator pointer can be passed to the MemPool to define how the
-// allocations should be done in the pool. For example: using a different
-// system allocator such as ncclMemAlloc.
-struct C10_CUDA_API MemPool {
-  MemPool(
-      CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
-      bool is_user_created = true,
-      bool use_on_oom = false);
-  MemPool(const MemPool&) = delete;
-  MemPool(MemPool&&) = default;
-  MemPool& operator=(const MemPool&) = delete;
-  MemPool& operator=(MemPool&&) = default;
-  ~MemPool();
-
-  MempoolId_t id();
-  CUDACachingAllocator::CUDAAllocator* allocator();
-  int use_count();
-  c10::DeviceIndex device();
-  static MempoolId_t graph_pool_handle(bool is_user_created = true);
-
- private:
-  static std::atomic<CaptureId_t> uid_;
-  static std::atomic<CaptureId_t> uuid_;
-  CUDACachingAllocator::CUDAAllocator* allocator_;
-  bool is_user_created_;
-  MempoolId_t id_;
-  c10::DeviceIndex device_;
-};
-
 } // namespace c10::cuda
--- a/torch/csrc/cuda/MemPool.cpp
+++ b/torch/csrc/cuda/MemPool.cpp
@ -4,6 +4,7 @@
 #include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>

+#include <ATen/cuda/MemPool.h>
 #include <c10/cuda/CUDACachingAllocator.h>

 template <typename T>
@ -12,16 +13,16 @@ using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
 // NOLINTNEXTLINE(misc-use-internal-linkage)
 void THCPMemPool_init(PyObject* module) {
  auto torch_C_m = py::handle(module).cast<py::module>();
-  shared_ptr_class_<::c10::cuda::MemPool>(torch_C_m, "_MemPool")
+  shared_ptr_class_<::at::cuda::MemPool>(torch_C_m, "_MemPool")
      .def(
          py::init([](c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator,
                      bool is_user_created,
                      bool use_on_oom) {
            torch::utils::device_lazy_init(at::kCUDA);
-            return std::make_shared<::c10::cuda::MemPool>(
+            return std::make_shared<::at::cuda::MemPool>(
                allocator, is_user_created, use_on_oom);
          }))
-      .def_property_readonly("id", &::c10::cuda::MemPool::id)
-      .def_property_readonly("allocator", &::c10::cuda::MemPool::allocator)
-      .def("use_count", &::c10::cuda::MemPool::use_count);
+      .def_property_readonly("id", &::at::cuda::MemPool::id)
+      .def_property_readonly("allocator", &::at::cuda::MemPool::allocator)
+      .def("use_count", &::at::cuda::MemPool::use_count);
 }
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@ -1103,7 +1103,7 @@ ErrorType ProcessGroupNCCL::getError() {
  return error_;
 }

-void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
+void ProcessGroupNCCL::registerMemPool(at::cuda::MemPool* pool, bool symm) {
  const auto key = std::to_string(pool->device());
  LOG(INFO) << logPrefix()
            << "Performing NCCL user buffer registration for all buffers in "
@ -1137,7 +1137,7 @@ void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
  }
 }

-void ProcessGroupNCCL::deregisterMemPool(c10::cuda::MemPool* pool) {
+void ProcessGroupNCCL::deregisterMemPool(at::cuda::MemPool* pool) {
  const auto key = std::to_string(pool->device());
  LOG(INFO) << logPrefix()
            << "Performing NCCL user buffer deregistration for all buffers in "
@ -5819,7 +5819,7 @@ at::Tensor ProcessGroupNCCL::allocateTensor(
        reinterpret_cast<c10::cuda::CUDACachingAllocator::CUDAAllocator*>(
            getMemAllocator().get());
    // Pool is created
-    memPool_ = std::make_unique<c10::cuda::MemPool>(allocator);
+    memPool_ = std::make_unique<at::cuda::MemPool>(allocator);
    // Register so that we call ncclCommRegister on all new allocations
    registerMemPool(memPool_.get(), /*symmetric*/ false);
    LOG(INFO) << logPrefix() << "Created memory pool";
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@ -30,6 +30,7 @@
 #include <ATen/DynamicLibrary.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/MemPool.h>
 #include <c10/core/Stream.h>
 #include <c10/core/StreamGuard.h>
 #include <c10/cuda/CUDACachingAllocator.h>
@ -1007,11 +1008,11 @@ class TORCH_API ProcessGroupNCCL : public Backend {

  // Performs NCCL user buffer registration for all buffers in
  // the given MemPool
-  void registerMemPool(c10::cuda::MemPool* pool, bool symm = false);
+  void registerMemPool(at::cuda::MemPool* pool, bool symm = false);

  // Performs NCCL user buffer de-registration for all buffers in
  // the given MemPool
-  void deregisterMemPool(c10::cuda::MemPool* pool);
+  void deregisterMemPool(at::cuda::MemPool* pool);

  // This method adds a temporary extension for the timeout period,
  // applying to all collectives between the calling of this API and
@ -1469,7 +1470,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
  std::optional<bool> useNonblocking_{std::nullopt};

  // Communication-optimized memory pool associated with this PG
-  std::unique_ptr<c10::cuda::MemPool> memPool_ = nullptr;
+  std::unique_ptr<at::cuda::MemPool> memPool_ = nullptr;
 };

 // Reset the flighrecorder recordings for the current rank.