Update
[ghstack-poisoned]
@@ -1,6 +1,7 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraph.h>
 #include <ATen/cuda/Exceptions.h>
+#include <ATen/cuda/MemPool.h>
 #include <ATen/Functions.h>
 #include <c10/cuda/CUDAFunctions.h>

@@ -13,7 +14,7 @@ static bool _cuda_graphs_debug = false;
 MempoolId_t graph_pool_handle() {
   // Sets just the second value, to distinguish it from MempoolId_ts created from
   // cudaStreamGetCaptureInfo id_s in capture_begin.
-  return c10::cuda::MemPool::graph_pool_handle();
+  return at::cuda::MemPool::graph_pool_handle();
 }

 /**
@@ -90,7 +91,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
   } else {
     // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
     // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
-    mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
+    mempool_id_ = at::cuda::MemPool::graph_pool_handle(false);
     TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
   }

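Both CUDAGraph.cpp hunks above only swap c10::cuda::MemPool for at::cuda::MemPool; the pool-sharing workflow itself is unchanged. As a hedged reminder of how these pieces fit together (the graph and handle names here are made up, and the side-stream setup that CUDA graph capture requires is omitted):

// Two graphs capturing into one shared mempool.
auto shared_pool = at::cuda::graph_pool_handle();  // user-created handle: {0, uid}
at::cuda::CUDAGraph g1, g2;
g1.capture_begin(shared_pool);
// ... enqueue work to record into g1 ...
g1.capture_end();
g2.capture_begin(shared_pool);  // reuses the same pool instead of minting a {uuid, 0} id
// ... enqueue work to record into g2 ...
g2.capture_end();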
aten/src/ATen/cuda/MemPool.cpp (new file, 69 lines)
@@ -0,0 +1,69 @@
+#include <ATen/core/CachingHostAllocator.h>
+#include <ATen/cuda/MemPool.h>
+
+namespace at::cuda {
+
+// uid_ is incremented when a user creates a MemPool,
+// for example: using graph_pool_handle() or c10::cuda::MemPool().
+//
+// uuid_ is incremented when CUDAGraph creates a MemPool
+// as a result of a user not providing a pool.
+//
+// MempoolId_t of {0, 0} is used to denote when no MemPool has been
+// passed to a function, either by user or CUDAGraphs. For example,
+// default value of MempoolId_t for capture_begin function is {0, 0}.
+// That's why uid_ and uuid_ start at 1.
+std::atomic<CaptureId_t> MemPool::uid_{1};
+std::atomic<CaptureId_t> MemPool::uuid_{1};
+
+MemPool::MemPool(
+    CUDACachingAllocator::CUDAAllocator* allocator,
+    bool is_user_created,
+    bool use_on_oom)
+    : allocator_(allocator), is_user_created_(is_user_created) {
+  if (is_user_created_) {
+    id_ = {0, uid_++};
+  } else {
+    id_ = {uuid_++, 0};
+  }
+  device_ = c10::cuda::current_device();
+  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
+  if (use_on_oom) {
+    CUDACachingAllocator::setUseOnOOM(device_, id_);
+  }
+}
+
+MemPool::~MemPool() {
+  // TORCH_INTERNAL_ASSERT(use_count() == 1);
+  // We used to assert that TORCH_INTERNAL_ASSERT(use_count() == 1);
+  // However, this assertion is not true if a memory pool is shared
+  // with a cuda graph. That CUDAGraph will increase the use count
+  // until it is reset.
+  CUDACachingAllocator::releasePool(device_, id_);
+  c10::cuda::CUDACachingAllocator::emptyCache(id_);
+}
+
+MempoolId_t MemPool::id() {
+  return id_;
+}
+
+CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
+  return allocator_;
+}
+
+int MemPool::use_count() {
+  return CUDACachingAllocator::getPoolUseCount(device_, id_);
+}
+
+c10::DeviceIndex MemPool::device() {
+  return device_;
+}
+
+MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
+  if (is_user_created) {
+    return {0, uid_++};
+  }
+  return {uuid_++, 0};
+}
+
+} // namespace at::cuda
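The uid_/uuid_ counters above encode where a pool came from: user-created pools get {0, uid}, graph-created pools get {uuid, 0}, and {0, 0} means "no pool given". A small sketch of how that surfaces through the public API (illustrative only; inspect_pool_ids is a made-up helper, and it assumes a CUDA device is already initialized so the default nullptr allocator falls back to the regular caching allocator):

#include <ATen/cuda/MemPool.h>
#include <c10/util/Exception.h>

void inspect_pool_ids() {
  // User-created pool: id() carries the counter in the second field.
  at::cuda::MemPool user_pool;
  TORCH_INTERNAL_ASSERT(user_pool.id().first == 0 && user_pool.id().second > 0);

  // Handle minted on behalf of CUDAGraph (is_user_created=false): first field set,
  // which is exactly what the `mempool_id_.first > 0` assert in capture_begin checks.
  auto graph_id = at::cuda::MemPool::graph_pool_handle(/*is_user_created=*/false);
  TORCH_INTERNAL_ASSERT(graph_id.first > 0 && graph_id.second == 0);
}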
aten/src/ATen/cuda/MemPool.h (new file, 44 lines)
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+
+namespace at::cuda {
+
+// Keep BC only
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+
+// MemPool represents a pool of memory in a caching allocator. Currently,
+// it's just the ID of the pool object maintained in the CUDACachingAllocator.
+//
+// An allocator pointer can be passed to the MemPool to define how the
+// allocations should be done in the pool. For example: using a different
+// system allocator such as ncclMemAlloc.
+struct TORCH_CUDA_CPP_API MemPool {
+  MemPool(
+      c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
+      bool is_user_created = true,
+      bool use_on_oom = false);
+  MemPool(const MemPool&) = delete;
+  MemPool(MemPool&&) = default;
+  MemPool& operator=(const MemPool&) = delete;
+  MemPool& operator=(MemPool&&) = default;
+  ~MemPool();
+
+  MempoolId_t id();
+  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator();
+  int use_count();
+  c10::DeviceIndex device();
+  static MempoolId_t graph_pool_handle(bool is_user_created = true);
+
+ private:
+  static std::atomic<CaptureId_t> uid_;
+  static std::atomic<CaptureId_t> uuid_;
+  c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator_;
+  bool is_user_created_;
+  MempoolId_t id_;
+  c10::DeviceIndex device_;
+};
+
+} // namespace at::cuda
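The header above is the entire public surface of the moved class. A short lifetime sketch that sticks to the calls the .cpp above actually makes (pool_lifetime_demo is a hypothetical helper; an initialized CUDA context is assumed):

#include <ATen/cuda/MemPool.h>

void pool_lifetime_demo() {
  at::cuda::MemPool pool;                // ctor: createOrIncrefPool() on the current device
  at::cuda::MempoolId_t id = pool.id();  // {0, uid} because is_user_created defaults to true
  int refs = pool.use_count();           // forwarded to getPoolUseCount(device, id)
  c10::DeviceIndex dev = pool.device();  // device index captured at construction time
  (void)id; (void)refs; (void)dev;
}                                        // dtor: releasePool() then emptyCache(id)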
@@ -19,6 +19,17 @@
 namespace c10 {

+using CaptureId_t = unsigned long long;
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // A DataPtr is a unique pointer (with an attached deleter and some
 // context for the deleter) to some memory, which also records what
 // device is for its data.

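MempoolIdHash exists so that MempoolId_t can key hash containers, which is how the caching allocator looks up its private pools. A minimal sketch (the map and function names are mine, and the include path assumes this hunk patches c10/core/Allocator.h, which the diff does not name):

#include <unordered_map>
#include <c10/core/Allocator.h>  // assumed location of the c10 hunk above

// Hash on whichever counter is non-zero, exactly as operator() above does.
std::unordered_map<c10::MempoolId_t, int, c10::MempoolIdHash> pool_refcounts;

void bump(c10::MempoolId_t id) {
  ++pool_refcounts[id];
}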
@@ -1012,12 +1012,6 @@ PrivatePoolState::PrivatePoolState(
   }
 }

-struct MempoolIdHash {
-  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
-    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
-  }
-};
-
 cudaError_t allocPrimitive(void** ptr, size_t size, AllocParams& p) {
   if (p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator()) {
     *ptr = p.pool->owner_PrivatePool->allocator()->raw_alloc(size);
@@ -4513,66 +4507,3 @@ std::atomic<CUDAAllocator*> allocator;
 static BackendStaticInitializer backend_static_initializer;
 } // namespace cuda::CUDACachingAllocator
 } // namespace c10
-
-namespace c10::cuda {
-
-// uid_ is incremented when a user creates a MemPool,
-// for example: using graph_pool_handle() or c10::cuda::MemPool().
-//
-// uuid_ is incremented when CUDAGraph creates a MemPool
-// as a result of a user not providing a pool.
-//
-// MempoolId_t of {0, 0} is used to denote when no MemPool has been
-// passed to a function, either by user or CUDAGraphs. For example,
-// default value of MempoolId_t for capture_begin function is {0, 0}.
-// That's why uid_ and uuid_ start at 1.
-std::atomic<CaptureId_t> MemPool::uid_{1};
-std::atomic<CaptureId_t> MemPool::uuid_{1};
-
-MemPool::MemPool(
-    CUDACachingAllocator::CUDAAllocator* allocator,
-    bool is_user_created,
-    bool use_on_oom)
-    : allocator_(allocator), is_user_created_(is_user_created) {
-  if (is_user_created_) {
-    id_ = {0, uid_++};
-  } else {
-    id_ = {uuid_++, 0};
-  }
-  device_ = c10::cuda::current_device();
-  CUDACachingAllocator::createOrIncrefPool(device_, id_, allocator);
-  if (use_on_oom) {
-    CUDACachingAllocator::setUseOnOOM(device_, id_);
-  }
-}
-
-MemPool::~MemPool() {
-  TORCH_INTERNAL_ASSERT(use_count() == 1);
-  CUDACachingAllocator::releasePool(device_, id_);
-  c10::cuda::CUDACachingAllocator::emptyCache(id_);
-}
-
-MempoolId_t MemPool::id() {
-  return id_;
-}
-
-CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
-  return allocator_;
-}
-
-int MemPool::use_count() {
-  return CUDACachingAllocator::getPoolUseCount(device_, id_);
-}
-
-c10::DeviceIndex MemPool::device() {
-  return device_;
-}
-
-MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
-  if (is_user_created) {
-    return {0, uid_++};
-  }
-  return {uuid_++, 0};
-}
-
-} // namespace c10::cuda
@@ -554,41 +554,7 @@ inline std::string getUserMetadata() {
 } // namespace c10::cuda::CUDACachingAllocator

-namespace c10::cuda {
-
-// Keep BC only
-using c10::CaptureId_t;
-using c10::MempoolId_t;
-
-// MemPool represents a pool of memory in a caching allocator. Currently,
-// it's just the ID of the pool object maintained in the CUDACachingAllocator.
-//
-// An allocator pointer can be passed to the MemPool to define how the
-// allocations should be done in the pool. For example: using a different
-// system allocator such as ncclMemAlloc.
-struct C10_CUDA_API MemPool {
-  MemPool(
-      CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
-      bool is_user_created = true,
-      bool use_on_oom = false);
-  MemPool(const MemPool&) = delete;
-  MemPool(MemPool&&) = default;
-  MemPool& operator=(const MemPool&) = delete;
-  MemPool& operator=(MemPool&&) = default;
-  ~MemPool();
-
-  MempoolId_t id();
-  CUDACachingAllocator::CUDAAllocator* allocator();
-  int use_count();
-  c10::DeviceIndex device();
-  static MempoolId_t graph_pool_handle(bool is_user_created = true);
-
- private:
-  static std::atomic<CaptureId_t> uid_;
-  static std::atomic<CaptureId_t> uuid_;
-  CUDACachingAllocator::CUDAAllocator* allocator_;
-  bool is_user_created_;
-  MempoolId_t id_;
-  c10::DeviceIndex device_;
-};
-
-} // namespace c10::cuda

@@ -4,6 +4,7 @@
 #include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>

+#include <ATen/cuda/MemPool.h>
 #include <c10/cuda/CUDACachingAllocator.h>

 template <typename T>
@@ -12,16 +13,16 @@ using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
 // NOLINTNEXTLINE(misc-use-internal-linkage)
 void THCPMemPool_init(PyObject* module) {
   auto torch_C_m = py::handle(module).cast<py::module>();
-  shared_ptr_class_<::c10::cuda::MemPool>(torch_C_m, "_MemPool")
+  shared_ptr_class_<::at::cuda::MemPool>(torch_C_m, "_MemPool")
       .def(
           py::init([](c10::cuda::CUDACachingAllocator::CUDAAllocator* allocator,
                       bool is_user_created,
                       bool use_on_oom) {
             torch::utils::device_lazy_init(at::kCUDA);
-            return std::make_shared<::c10::cuda::MemPool>(
+            return std::make_shared<::at::cuda::MemPool>(
                 allocator, is_user_created, use_on_oom);
           }))
-      .def_property_readonly("id", &::c10::cuda::MemPool::id)
-      .def_property_readonly("allocator", &::c10::cuda::MemPool::allocator)
-      .def("use_count", &::c10::cuda::MemPool::use_count);
+      .def_property_readonly("id", &::at::cuda::MemPool::id)
+      .def_property_readonly("allocator", &::at::cuda::MemPool::allocator)
+      .def("use_count", &::at::cuda::MemPool::use_count);
 }
@@ -1103,7 +1103,7 @@ ErrorType ProcessGroupNCCL::getError() {
   return error_;
 }

-void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
+void ProcessGroupNCCL::registerMemPool(at::cuda::MemPool* pool, bool symm) {
   const auto key = std::to_string(pool->device());
   LOG(INFO) << logPrefix()
             << "Performing NCCL user buffer registration for all buffers in "
@@ -1137,7 +1137,7 @@ void ProcessGroupNCCL::registerMemPool(c10::cuda::MemPool* pool, bool symm) {
   }
 }

-void ProcessGroupNCCL::deregisterMemPool(c10::cuda::MemPool* pool) {
+void ProcessGroupNCCL::deregisterMemPool(at::cuda::MemPool* pool) {
   const auto key = std::to_string(pool->device());
   LOG(INFO) << logPrefix()
             << "Performing NCCL user buffer deregistration for all buffers in "
@@ -5819,7 +5819,7 @@ at::Tensor ProcessGroupNCCL::allocateTensor(
       reinterpret_cast<c10::cuda::CUDACachingAllocator::CUDAAllocator*>(
           getMemAllocator().get());
   // Pool is created
-  memPool_ = std::make_unique<c10::cuda::MemPool>(allocator);
+  memPool_ = std::make_unique<at::cuda::MemPool>(allocator);
   // Register so that we call ncclCommRegister on all new allocations
   registerMemPool(memPool_.get(), /*symmetric*/ false);
   LOG(INFO) << logPrefix() << "Created memory pool";
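allocateTensor() wraps a communication-friendly allocator in the new at::cuda::MemPool and then registers the pool with the process group. The same pattern as a free-standing, hedged sketch (register_comm_pool is a made-up helper; the include path and the public visibility of registerMemPool/deregisterMemPool are assumptions on my part):

#include <memory>
#include <ATen/cuda/MemPool.h>
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>

void register_comm_pool(
    c10d::ProcessGroupNCCL& pg,
    c10::cuda::CUDACachingAllocator::CUDAAllocator* alloc) {
  // e.g. an allocator backed by ncclMemAlloc, as the MemPool.h comment suggests.
  auto pool = std::make_unique<at::cuda::MemPool>(alloc);
  pg.registerMemPool(pool.get(), /*symm=*/false);  // ncclCommRegister allocations made in this pool
  // ... allocate communication buffers while the registration is active ...
  pg.deregisterMemPool(pool.get());                // undo registration before the pool is destroyed
}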
@@ -30,6 +30,7 @@
 #include <ATen/DynamicLibrary.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/MemPool.h>
 #include <c10/core/Stream.h>
 #include <c10/core/StreamGuard.h>
 #include <c10/cuda/CUDACachingAllocator.h>
@@ -1007,11 +1008,11 @@ class TORCH_API ProcessGroupNCCL : public Backend {

   // Performs NCCL user buffer registration for all buffers in
   // the given MemPool
-  void registerMemPool(c10::cuda::MemPool* pool, bool symm = false);
+  void registerMemPool(at::cuda::MemPool* pool, bool symm = false);

   // Performs NCCL user buffer de-registration for all buffers in
   // the given MemPool
-  void deregisterMemPool(c10::cuda::MemPool* pool);
+  void deregisterMemPool(at::cuda::MemPool* pool);

   // This method adds a temporary extension for the timeout period,
   // applying to all collectives between the calling of this API and
@@ -1469,7 +1470,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::optional<bool> useNonblocking_{std::nullopt};

   // Communication-optimized memory pool associated with this PG
-  std::unique_ptr<c10::cuda::MemPool> memPool_ = nullptr;
+  std::unique_ptr<at::cuda::MemPool> memPool_ = nullptr;
 };

 // Reset the flighrecorder recordings for the current rank.