[ghstack-poisoned]
This commit is contained in:
Daniel Galvez
2025-11-10 18:36:26 -08:00
parent 5e90e65d0f
commit 813523de1a
9 changed files with 140 additions and 116 deletions

View File

@ -1,6 +1,7 @@
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/MemPool.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDAFunctions.h>
@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false;
MempoolId_t graph_pool_handle() {
// Sets just the second value, to distinguish it from MempoolId_ts created from
// cudaStreamGetCaptureInfo id_s in capture_begin.
return c10::cuda::MemPool::graph_pool_handle();
return at::cuda::MemPool::graph_pool_handle();
}
/**
@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
} else {
// User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
// Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
mempool_id_ = at::cuda::MemPool::graph_pool_handle(false);
TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
}

View File

@ -0,0 +1,69 @@
#include <ATen/core/CachingHostAllocator.h>
#include <ATen/cuda/MemPool.h>
namespace at::cuda {
// uid_ is incremented when a user creates a MemPool,
// for example: using graph_pool_handle() or c10::cuda::MemPool().
//
// uuid_ is incremented when CUDAGraph creates a MemPool
// as a result of a user not providing a pool.
//
// MempoolId_t of {0, 0} is used to denote when no MemPool has been
// passed to a function, either by user or CUDAGraphs. For example,
// default value of MempoolId_t for capture_begin function is {0, 0}.
// That's why uid_ and uuid_ start at 1.
std::atomic<CaptureId_t> MemPool::uid_{1};
std::atomic<CaptureId_t> MemPool::uuid_{1};
MemPool::MemPool(
CUDACachingAllocator::CUDAAllocator* allocator,
bool is_user_created,
bool use_on_oom)
: allocator_(allocator), is_user_created_(is_user_created) {
if (is_user_created_) {
id_ = {0, uid_++};
} else {
id_ = {uuid_++, 0};
}
device_ = c10::cuda::current_device();
CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
if (use_on_oom) {
CUDACachingAllocator::setUseOnOOM(device_, id_);
}
}
MemPool::~MemPool() {
// TORCH_INTERNAL_ASSERT(use_count() == 1);
// We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1);
// However, this assertion is not true if a memory pool is shared
// with a cuda graph. That CUDAGraph will increase the use count
// until it is reset.
CUDACachingAllocator::releasePool(device_, id_);
c10::cuda::CUDACachingAllocator::emptyCache(id_);
}
MempoolId_t MemPool::id() {
return id_;
}
CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
return allocator_;
}
int MemPool::use_count() {
return CUDACachingAllocator::getPoolUseCount(device_, id_);
}
c10::DeviceIndex MemPool::device() {
return device_;
}
MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
if (is_user_created) {
return {0, uid_++};
}
return {uuid_++, 0};
}
} // namespace at::cuda

View File

@ -0,0 +1,44 @@
#pragma once
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
namespace at::cuda {
// Keep BC only
using c10::CaptureId_t;
using c10::MempoolId_t;
// MemPool represents a pool of memory in a caching allocator. Currently,
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
//
// An allocator pointer can be passed to the MemPool to define how the
// allocations should be done in the pool. For example: using a different
// system allocator such as ncclMemAlloc.
struct TORCH_CUDA_CPP_API MemPool {
MemPool(
c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
bool is_user_created = true,
bool use_on_oom = false);
MemPool(const MemPool&) = delete;
MemPool(MemPool&&) = default;
MemPool& operator=(const MemPool&) = delete;
MemPool& operator=(MemPool&&) = default;
~MemPool();
MempoolId_t id();
c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator();
int use_count();
c10::DeviceIndex device();
static MempoolId_t graph_pool_handle(bool is_user_created = true);
private:
static std::atomic<CaptureId_t> uid_;
static std::atomic<CaptureId_t> uuid_;
c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_;
bool is_user_created_;
MempoolId_t id_;
c10::DeviceIndex device_;
};
} // namespace at::cuda

View File

@ -19,6 +19,17 @@
namespace c10 {
using CaptureId_t = unsigned long long;
// first is set if the instance is created by CUDAGraph::capture_begin.
// second is set if the instance is created by at::cuda::graph_pool_handle.
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
struct MempoolIdHash {
std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
}
};
// A DataPtr is a unique pointer (with an attached deleter and some
// context for the deleter) to some memory, which also records what
// device is for its data.

View File

@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState(
}
}
struct MempoolIdHash {
std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
}
};
cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) {
if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) {
*ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size);
@ -4513,66 +4507,3 @@ std::atomic<CUDAAllocator*> allocator;
static BackendStaticInitializer backend_static_initializer;
} // namespace cuda::CUDACachingAllocator
} // namespace c10
namespace c10::cuda {
// uid_ is incremented when a user creates a MemPool,
// for example: using graph_pool_handle() or c10::cuda::MemPool().
//
// uuid_ is incremented when CUDAGraph creates a MemPool
// as a result of a user not providing a pool.
//
// MempoolId_t of {0, 0} is used to denote when no MemPool has been
// passed to a function, either by user or CUDAGraphs. For example,
// default value of MempoolId_t for capture_begin function is {0, 0}.
// That's why uid_ and uuid_ start at 1.
std::atomic<CaptureId_t> MemPool::uid_{1};
std::atomic<CaptureId_t> MemPool::uuid_{1};
MemPool::MemPool(
CUDACachingAllocator::CUDAAllocator* allocator,
bool is_user_created,
bool use_on_oom)
: allocator_(allocator), is_user_created_(is_user_created) {
if (is_user_created_) {
id_ = {0, uid_++};
} else {
id_ = {uuid_++, 0};
}
device_ = c10::cuda::current_device();
CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
if (use_on_oom) {
CUDACachingAllocator::setUseOnOOM(device_, id_);
}
}
MemPool::~MemPool() {
TORCH_INTERNAL_ASSERT(use_count() == 1);
CUDACachingAllocator::releasePool(device_, id_);
c10::cuda::CUDACachingAllocator::emptyCache(id_);
}
MempoolId_t MemPool::id() {
return id_;
}
CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
return allocator_;
}
int MemPool::use_count() {
return CUDACachingAllocator::getPoolUseCount(device_, id_);
}
c10::DeviceIndex MemPool::device() {
return device_;
}
MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
if (is_user_created) {
return {0, uid_++};
}
return {uuid_++, 0};
}
} // namespace c10::cuda

View File

@ -554,41 +554,7 @@ inline std::string getUserMetadata() {
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {
// Keep BC only
using c10::CaptureId_t;
using c10::MempoolId_t;
// MemPool represents a pool of memory in a caching allocator. Currently,
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
//
// An allocator pointer can be passed to the MemPool to define how the
// allocations should be done in the pool. For example: using a different
// system allocator such as ncclMemAlloc.
struct C10_CUDA_API MemPool {
MemPool(
CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
bool is_user_created = true,
bool use_on_oom = false);
MemPool(const MemPool&) = delete;
MemPool(MemPool&&) = default;
MemPool& operator=(const MemPool&) = delete;
MemPool& operator=(MemPool&&) = default;
~MemPool();
MempoolId_t id();
CUDACachingAllocator::CUDAAllocator* allocator();
int use_count();
c10::DeviceIndex device();
static MempoolId_t graph_pool_handle(bool is_user_created = true);
private:
static std::atomic<CaptureId_t> uid_;
static std::atomic<CaptureId_t> uuid_;
CUDACachingAllocator::CUDAAllocator* allocator_;
bool is_user_created_;
MempoolId_t id_;
c10::DeviceIndex device_;
};
} // namespace c10::cuda

View File

@ -4,6 +4,7 @@
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#include <ATen/cuda/MemPool.h>
#include <c10/cuda/CUDACachingAllocator.h>
template <typename T>
@ -12,16 +13,16 @@ using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
// NOLINTNEXTLINE(misc-use-internal-linkage)
void THCPMemPool_init(PyObject* module) {
auto torch_C_m = py::handle(module).cast<py::module>();
shared_ptr_class_<::c10::cuda::MemPool>(torch_C_m, "_MemPool")
shared_ptr_class_<::at::cuda::MemPool>(torch_C_m, "_MemPool")
.def(
py::init([](c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator,
bool is_user_created,
bool use_on_oom) {
torch::utils::device_lazy_init(at::kCUDA);
return std::make_shared<::c10::cuda::MemPool>(
return std::make_shared<::at::cuda::MemPool>(
allocator, is_user_created, use_on_oom);
}))
.def_property_readonly("id", &::c10::cuda::MemPool::id)
.def_property_readonly("allocator", &::c10::cuda::MemPool::allocator)
.def("use_count", &::c10::cuda::MemPool::use_count);
.def_property_readonly("id", &::at::cuda::MemPool::id)
.def_property_readonly("allocator", &::at::cuda::MemPool::allocator)
.def("use_count", &::at::cuda::MemPool::use_count);
}

View File

@ -1103,7 +1103,7 @@ ErrorType ProcessGroupNCCL::getError() {
return error_;
}
void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
void ProcessGroupNCCL::registerMemPool(at::cuda::MemPool* pool, bool symm) {
const auto key = std::to_string(pool->device());
LOG(INFO) << logPrefix()
<< "Performing NCCL user buffer registration for all buffers in "
@ -1137,7 +1137,7 @@ void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
}
}
void ProcessGroupNCCL::deregisterMemPool(c10::cuda::MemPool* pool) {
void ProcessGroupNCCL::deregisterMemPool(at::cuda::MemPool* pool) {
const auto key = std::to_string(pool->device());
LOG(INFO) << logPrefix()
<< "Performing NCCL user buffer deregistration for all buffers in "
@ -5819,7 +5819,7 @@ at::Tensor ProcessGroupNCCL::allocateTensor(
reinterpret_cast<c10::cuda::CUDACachingAllocator::CUDAAllocator*>(
getMemAllocator().get());
// Pool is created
memPool_ = std::make_unique<c10::cuda::MemPool>(allocator);
memPool_ = std::make_unique<at::cuda::MemPool>(allocator);
// Register so that we call ncclCommRegister on all new allocations
registerMemPool(memPool_.get(), /*symmetric*/ false);
LOG(INFO) << logPrefix() << "Created memory pool";

View File

@ -30,6 +30,7 @@
#include <ATen/DynamicLibrary.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/MemPool.h>
#include <c10/core/Stream.h>
#include <c10/core/StreamGuard.h>
#include <c10/cuda/CUDACachingAllocator.h>
@ -1007,11 +1008,11 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// Performs NCCL user buffer registration for all buffers in
// the given MemPool
void registerMemPool(c10::cuda::MemPool* pool, bool symm = false);
void registerMemPool(at::cuda::MemPool* pool, bool symm = false);
// Performs NCCL user buffer de-registration for all buffers in
// the given MemPool
void deregisterMemPool(c10::cuda::MemPool* pool);
void deregisterMemPool(at::cuda::MemPool* pool);
// This method adds a temporary extension for the timeout period,
// applying to all collectives between the calling of this API and
@ -1469,7 +1470,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
std::optional<bool> useNonblocking_{std::nullopt};
// Communication-optimized memory pool associated with this PG
std::unique_ptr<c10::cuda::MemPool> memPool_ = nullptr;
std::unique_ptr<at::cuda::MemPool> memPool_ = nullptr;
};
// Reset the flighrecorder recordings for the current rank.