mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
# Motivation In line with [RFC] [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories such as HuggingFace, which contains [a lot of if-else conditional code](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code). We would like to introduce a generic API set under the `torch.accelerator` namespace to generalize these use cases. <div align="center"> <table> <tr> <td> Device-specific memory APIs torch.xxx.foo</td> <td> Device-agnostic memory APIs torch.accelerator.foo</td> </tr> <tr> <td> ```python torch.xxx.empty_cache ``` </td> <td> ```python torch.accelerator.empty_cache ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_peak_memory_stats ``` </td> <td> ```python torch.accelerator.reset_peak_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_accumulated_memory_stats ``` </td> <td> ```python torch.accelerator.reset_accumulated_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_stats ``` </td> <td> ```python torch.accelerator.memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_allocated ``` </td> <td> ```python torch.accelerator.memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_allocated ``` </td> <td> ```python torch.accelerator.max_memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_reserved ``` </td> <td> ```python torch.accelerator.memory_reserved ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_reserved ``` </td> <td> ```python torch.accelerator.max_memory_reserved ``` </td> </tr> </table> </div> # Solution This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. 
This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `getDeviceAllocator(device_type)->emptyCache()`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222 Approved by: https://github.com/albanD, https://github.com/Camyll
115 lines
3.9 KiB
C++
115 lines
3.9 KiB
C++
#pragma once
|
|
|
|
#include <c10/core/Allocator.h>
|
|
#include <c10/core/Stream.h>
|
|
|
|
namespace c10::CachingDeviceAllocator {
|
|
|
|
using namespace c10::CachingAllocator;
|
|
|
|
// Struct containing memory allocator summary statistics for a device.
|
|
struct DeviceStats {
|
|
// COUNT: allocations requested by client code
|
|
StatArray allocation;
|
|
// COUNT: number of allocated segments from device memory allocation.
|
|
StatArray segment;
|
|
// COUNT: number of active memory blocks (allocated or used by stream)
|
|
StatArray active;
|
|
// COUNT: number of inactive, split memory blocks (unallocated but can't be
|
|
// released via device memory deallocation)
|
|
StatArray inactive_split;
|
|
|
|
// SUM: bytes allocated by this memory allocator
|
|
StatArray allocated_bytes;
|
|
// SUM: bytes reserved by this memory allocator (both free and used)
|
|
StatArray reserved_bytes;
|
|
// SUM: bytes within active memory blocks
|
|
StatArray active_bytes;
|
|
// SUM: bytes within inactive, split memory blocks
|
|
StatArray inactive_split_bytes;
|
|
// SUM: bytes requested by client code
|
|
StatArray requested_bytes;
|
|
|
|
// COUNT: total number of failed calls to device malloc necessitating cache
|
|
// flushes.
|
|
int64_t num_alloc_retries = 0;
|
|
|
|
// COUNT: total number of OOMs (i.e. failed calls to device memory allocation
|
|
// after cache flush)
|
|
int64_t num_ooms = 0;
|
|
|
|
// COUNT: total number of oversize blocks allocated from pool
|
|
Stat oversize_allocations;
|
|
|
|
// COUNT: total number of oversize blocks requiring malloc
|
|
Stat oversize_segments;
|
|
|
|
// COUNT: total number of synchronize_and_free_events() calls
|
|
int64_t num_sync_all_streams = 0;
|
|
|
|
// COUNT: total number of device memory allocation calls. This includes both
|
|
// mapped and malloced memory.
|
|
int64_t num_device_alloc = 0;
|
|
|
|
// COUNT: total number of device memory deallocation calls. This includes both
|
|
// un-mapped and free memory.
|
|
int64_t num_device_free = 0;
|
|
|
|
// SIZE: maximum block size that is allowed to be split.
|
|
int64_t max_split_size = 0;
|
|
};
|
|
|
|
} // namespace c10::CachingDeviceAllocator
|
|
|
|
namespace c10 {
|
|
|
|
// Integer id type; both components of MempoolId_t below use it.
using CaptureId_t = unsigned long long;

// Pair of ids naming a memory pool (see DeviceAllocator::emptyCache).
// first is set if the instance is created by Graph mode capture_begin.
// second is set if the instance is created by Graph mode graph_pool_handle.
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
|
|
|
|
// Abstract allocator interface for devices, layered on top of c10::Allocator.
// Apart from the constructor and destructor, every member function declared
// here is pure virtual, so a concrete allocator has to implement all of them.
struct C10_API DeviceAllocator : public c10::Allocator {
  DeviceAllocator();
  ~DeviceAllocator() override;

  // Returns true if the allocator has been properly initialized and is ready
  // for use
  virtual bool initialized() = 0;

  // Releases all cached device memory from the specified memory pool back to
  // the system. When no pool is given, mempool_id defaults to {0, 0}.
  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;

  // Associates a memory allocation with a stream to establish dependency
  // tracking. Prevents memory reuse until all operations on the specified
  // stream complete
  virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;

  // Retrieves comprehensive memory statistics for the specified device,
  // including allocation patterns, usage metrics
  virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
      c10::DeviceIndex device) = 0;

  // Resets cumulative allocation statistics for the specified device to zero
  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;

  // Resets peak memory usage statistics for the specified device
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
};
|
|
|
|
// This function is used to get the DeviceAllocator for a specific device type
|
|
// and keep backward compatibility with c10::GetAllocator.
|
|
C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
|
|
TORCH_CHECK(
|
|
t != DeviceType::CPU,
|
|
"getDeviceAllocator is not supported for CPU device type.");
|
|
auto* allocator = c10::GetAllocator(t);
|
|
auto* device_allocator = dynamic_cast<DeviceAllocator*>(allocator);
|
|
TORCH_INTERNAL_ASSERT(
|
|
device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
|
|
return device_allocator;
|
|
}
|
|
|
|
} // namespace c10
|