mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Add DeviceAllocator as the base device allocator (#138222)
# Motivation

In line with [RFC] [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories, such as HuggingFace, which contains [so many if-else conditional code paths](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code). We would like to introduce a generic API set under the `torch.accelerator` namespace to generalize these use cases.

| Device-specific memory APIs `torch.xxx.foo` | Device-agnostic memory APIs `torch.accelerator.foo` |
| --- | --- |
| `torch.xxx.empty_cache` | `torch.accelerator.empty_cache` |
| `torch.xxx.reset_peak_memory_stats` | `torch.accelerator.reset_peak_memory_stats` |
| `torch.xxx.reset_accumulated_memory_stats` | `torch.accelerator.reset_accumulated_memory_stats` |
| `torch.xxx.memory_stats` | `torch.accelerator.memory_stats` |
| `torch.xxx.memory_allocated` | `torch.accelerator.memory_allocated` |
| `torch.xxx.max_memory_allocated` | `torch.accelerator.max_memory_allocated` |
| `torch.xxx.memory_reserved` | `torch.accelerator.memory_reserved` |
| `torch.xxx.max_memory_reserved` | `torch.accelerator.max_memory_reserved` |

# Solution

This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit.
This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `c10::getDeviceAllocator(device_type)->emptyCache()`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222 Approved by: https://github.com/albanD, https://github.com/Camyll
This commit is contained in:
committed by
PyTorch MergeBot
parent
f6d138807f
commit
1179e33323
@ -2,7 +2,6 @@
|
||||
#include <ATen/cuda/CUDAGraph.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <ATen/Functions.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
|
||||
#include <cstddef>
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAGraphsC10Utils.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
|
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
|
||||
// Use of c10::hip namespace here makes hipification easier, because
|
||||
@ -10,10 +10,10 @@ namespace c10::hip {
|
||||
// Takes a valid HIPAllocator (of any sort) and turns it into
|
||||
// an allocator pretending to be a CUDA allocator. See
|
||||
// Note [Masquerading as CUDA]
|
||||
class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
|
||||
Allocator* allocator_;
|
||||
class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
|
||||
DeviceAllocator* allocator_;
|
||||
public:
|
||||
explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
|
||||
explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
|
||||
: allocator_(allocator) {}
|
||||
DataPtr allocate(size_t size) override {
|
||||
DataPtr r = allocator_->allocate(size);
|
||||
@ -26,6 +26,24 @@ public:
|
||||
void copy_data(void* dest, const void* src, std::size_t count) const final {
|
||||
allocator_->copy_data(dest, src, count);
|
||||
}
|
||||
bool initialized() override {
|
||||
return allocator_->initialized();
|
||||
}
|
||||
void emptyCache(MempoolId_t mempool_id = {0, 0}) {
|
||||
allocator_->emptyCache(mempool_id);
|
||||
}
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) {
|
||||
allocator_->recordStream(ptr, stream);
|
||||
}
|
||||
CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
|
||||
return allocator_->getDeviceStats(device);
|
||||
}
|
||||
void resetAccumulatedStats(c10::DeviceIndex device) {
|
||||
allocator_->resetAccumulatedStats(device);
|
||||
}
|
||||
void resetPeakStats(c10::DeviceIndex device) {
|
||||
allocator_->resetPeakStats(device);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace c10::hip
|
||||
|
@ -4,8 +4,9 @@
|
||||
namespace c10 { namespace hip {
|
||||
namespace HIPCachingAllocatorMasqueradingAsCUDA {
|
||||
|
||||
Allocator* get() {
|
||||
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
|
||||
|
||||
Allocator* get() {
|
||||
return &allocator;
|
||||
}
|
||||
|
||||
@ -13,5 +14,9 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
|
||||
HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
|
||||
}
|
||||
|
||||
// Register this HIP allocator as CUDA allocator to enable access through both
|
||||
// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
|
||||
REGISTER_ALLOCATOR(kCUDA, &allocator)
|
||||
|
||||
} // namespace HIPCachingAllocatorMasqueradingAsCUDA
|
||||
}} // namespace c10::hip
|
||||
|
10
c10/core/CachingDeviceAllocator.cpp
Normal file
10
c10/core/CachingDeviceAllocator.cpp
Normal file
@ -0,0 +1,10 @@
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
// DeviceAllocator declares its constructor and destructor in
// CachingDeviceAllocator.h and defines them here, out of line, as defaulted.
// Ensures proper DLL export of this pure virtual base class on Windows,
|
||||
// since it's mainly used in other DLLs outside c10.dll.
|
||||
// Defaulted constructor, defined in this translation unit.
DeviceAllocator::DeviceAllocator() = default;
|
||||
// Defaulted destructor (declared `override` in the header).
DeviceAllocator::~DeviceAllocator() = default;
|
||||
|
||||
} // namespace c10
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/Stream.h>
|
||||
|
||||
namespace c10::CachingDeviceAllocator {
|
||||
|
||||
@ -59,3 +60,55 @@ struct DeviceStats {
|
||||
};
|
||||
|
||||
} // namespace c10::CachingDeviceAllocator
|
||||
|
||||
namespace c10 {
|
||||
|
||||
using CaptureId_t = unsigned long long;
|
||||
|
||||
// first is set if the instance is created by Graph mode capture_begin.
|
||||
// second is set if the instance is created by Graph mode graph_pool_handle.
|
||||
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
|
||||
|
||||
// Abstract interface for device allocators. It extends c10::Allocator with
// pure-virtual hooks for initialization state, cache release, stream
// recording and per-device statistics. In this change CUDAAllocator,
// XPUAllocator and HIPAllocatorMasqueradingAsCUDA derive from it.
struct C10_API DeviceAllocator : public c10::Allocator {
|
||||
// Constructor and destructor are only declared here; both are defined
// (defaulted) out of line in CachingDeviceAllocator.cpp.
DeviceAllocator();
|
||||
~DeviceAllocator() override;
|
||||
|
||||
// Returns true if the allocator has been properly initialized and is ready
|
||||
// for use
|
||||
virtual bool initialized() = 0;
|
||||
|
||||
// Releases all cached device memory from the specified memory pool back to
|
||||
// the system
|
||||
// mempool_id defaults to {0, 0}. Default arguments are resolved from the
// static type at the call site, so overriders should repeat the same default.
virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
|
||||
// Associates a memory allocation with a stream to establish dependency
|
||||
// tracking. Prevents memory reuse until all operations on the specified
|
||||
// stream complete
|
||||
// The stream is passed as a generic c10::Stream; backend implementations
// convert it to their own stream type.
virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;
|
||||
|
||||
// Retrieves comprehensive memory statistics for the specified device,
|
||||
// including allocation patterns, usage metrics
|
||||
// Returned by value as a CachingDeviceAllocator::DeviceStats.
virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device) = 0;
|
||||
|
||||
// Resets cumulative allocation statistics for the specified device to zero
|
||||
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
|
||||
|
||||
// Resets peak memory usage statistics for the specified device
|
||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||
};
|
||||
|
||||
// This function is used to get the DeviceAllocator for a specific device type
|
||||
// and keep backward compatibility with c10::GetAllocator.
|
||||
// Looks up the allocator registered for device type `t` via c10::GetAllocator
// and returns it as a DeviceAllocator*.
//
// Raises through TORCH_CHECK when `t` is DeviceType::CPU, and through
// TORCH_INTERNAL_ASSERT when the registered allocator is not a
// DeviceAllocator (the dynamic_cast yields nullptr).
C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
  TORCH_CHECK(
      t != DeviceType::CPU,
      "getDeviceAllocator is not supported for CPU device type.");
  // Downcast the generic allocator in one step; a null result means the
  // registered allocator does not implement the DeviceAllocator interface.
  auto* as_device_allocator =
      dynamic_cast<DeviceAllocator*>(c10::GetAllocator(t));
  TORCH_INTERNAL_ASSERT(
      as_device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
  return as_device_allocator;
}
|
||||
|
||||
} // namespace c10
|
||||
|
@ -4179,6 +4179,7 @@ struct BackendStaticInitializer {
|
||||
|
||||
BackendStaticInitializer() {
|
||||
auto r = parseEnvForBackend();
|
||||
at::SetAllocator(kCUDA, r, 0);
|
||||
allocator.store(r);
|
||||
}
|
||||
};
|
||||
|
@ -202,25 +202,24 @@ struct ShareableHandle {
|
||||
std::string handle;
|
||||
};
|
||||
|
||||
class CUDAAllocator : public Allocator {
|
||||
class CUDAAllocator : public DeviceAllocator {
|
||||
public:
|
||||
virtual void* raw_alloc(size_t nbytes) = 0;
|
||||
virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
|
||||
virtual void raw_delete(void* ptr) = 0;
|
||||
virtual void init(int device_count) = 0;
|
||||
virtual bool initialized() = 0;
|
||||
virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
|
||||
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
|
||||
virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
virtual void enable(bool value) = 0;
|
||||
virtual bool isEnabled() const = 0;
|
||||
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
|
||||
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
|
||||
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
|
||||
virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device) = 0;
|
||||
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
|
||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||
// Keep for BC only
|
||||
virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0;
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) override {
|
||||
CUDAStream cuda_stream = CUDAStream(stream);
|
||||
recordStream(ptr, cuda_stream);
|
||||
}
|
||||
virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
virtual void beginAllocateToPool(
|
||||
c10::DeviceIndex device,
|
||||
@ -525,6 +524,10 @@ inline void enablePeerAccess(
|
||||
|
||||
namespace c10::cuda {
|
||||
|
||||
// Keep for BC only
|
||||
using c10::CaptureId_t;
|
||||
using c10::MempoolId_t;
|
||||
|
||||
// MemPool represents a pool of memory in a caching allocator. Currently,
|
||||
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
|
||||
//
|
||||
|
@ -9,12 +9,6 @@
|
||||
|
||||
namespace c10::cuda {
|
||||
|
||||
using CaptureId_t = unsigned long long;
|
||||
|
||||
// first is set if the instance is created by CUDAGraph::capture_begin.
|
||||
// second is set if the instance is created by at::cuda::graph_pool_handle.
|
||||
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
|
||||
|
||||
// RAII guard for "cudaStreamCaptureMode", a thread-local value
|
||||
// that controls the error-checking strictness of a capture.
|
||||
struct C10_CUDA_API CUDAStreamCaptureModeGuard {
|
||||
|
@ -540,7 +540,7 @@ class DeviceCachingAllocator {
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
|
||||
class XPUAllocator : public Allocator {
|
||||
class XPUAllocator : public DeviceAllocator {
|
||||
private:
|
||||
std::mutex mutex;
|
||||
ska::flat_hash_map<void*, Block*> allocated_blocks;
|
||||
@ -576,6 +576,10 @@ class XPUAllocator : public Allocator {
|
||||
}
|
||||
}
|
||||
|
||||
bool initialized() override {
|
||||
return !device_allocators.empty();
|
||||
}
|
||||
|
||||
void malloc(
|
||||
void** devPtr,
|
||||
DeviceIndex device,
|
||||
@ -610,13 +614,13 @@ class XPUAllocator : public Allocator {
|
||||
}
|
||||
}
|
||||
|
||||
void emptyCache() {
|
||||
void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
|
||||
for (auto& da : device_allocators) {
|
||||
da->emptyCache();
|
||||
}
|
||||
}
|
||||
|
||||
void recordStream(const DataPtr& ptr, XPUStream stream) {
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) override {
|
||||
if (!ptr.get()) {
|
||||
return;
|
||||
}
|
||||
@ -626,7 +630,8 @@ class XPUAllocator : public Allocator {
|
||||
|
||||
Block* block = get_allocated_block(ptr.get());
|
||||
TORCH_CHECK(block, "No allocated block can be found.");
|
||||
device_allocators[block->device]->recordStream(block, stream);
|
||||
c10::xpu::XPUStream xpu_stream{stream};
|
||||
device_allocators[block->device]->recordStream(block, xpu_stream);
|
||||
}
|
||||
|
||||
DataPtr allocate(size_t size) override {
|
||||
@ -679,17 +684,17 @@ class XPUAllocator : public Allocator {
|
||||
": did you call init?");
|
||||
}
|
||||
|
||||
DeviceStats getDeviceStats(DeviceIndex device) {
|
||||
DeviceStats getDeviceStats(DeviceIndex device) override {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getStats();
|
||||
}
|
||||
|
||||
void resetPeakStats(DeviceIndex device) {
|
||||
void resetPeakStats(DeviceIndex device) override {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetPeakStats();
|
||||
}
|
||||
|
||||
void resetAccumulatedStats(DeviceIndex device) {
|
||||
void resetAccumulatedStats(DeviceIndex device) override {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
|
Reference in New Issue
Block a user