#pragma once #include #include namespace c10::CachingDeviceAllocator { using namespace c10::CachingAllocator; // Struct containing memory allocator summary statistics for a device. struct DeviceStats { // COUNT: allocations requested by client code StatArray allocation; // COUNT: number of allocated segments from device memory allocation. StatArray segment; // COUNT: number of active memory blocks (allocated or used by stream) StatArray active; // COUNT: number of inactive, split memory blocks (unallocated but can't be // released via device memory deallocation) StatArray inactive_split; // SUM: bytes allocated by this memory allocator StatArray allocated_bytes; // SUM: bytes reserved by this memory allocator (both free and used) StatArray reserved_bytes; // SUM: bytes within active memory blocks StatArray active_bytes; // SUM: bytes within inactive, split memory blocks StatArray inactive_split_bytes; // SUM: bytes requested by client code StatArray requested_bytes; // COUNT: total number of failed calls to device malloc necessitating cache // flushes. int64_t num_alloc_retries = 0; // COUNT: total number of OOMs (i.e. failed calls to device memory allocation // after cache flush) int64_t num_ooms = 0; // COUNT: total number of oversize blocks allocated from pool Stat oversize_allocations; // COUNT: total number of oversize blocks requiring malloc Stat oversize_segments; // COUNT: total number of synchronize_and_free_events() calls int64_t num_sync_all_streams = 0; // COUNT: total number of device memory allocation calls. This includes both // mapped and malloced memory. int64_t num_device_alloc = 0; // COUNT: total number of device memory deallocation calls. This includes both // un-mapped and free memory. int64_t num_device_free = 0; // SIZE: maximum block size that is allowed to be split. int64_t max_split_size = 0; }; } // namespace c10::CachingDeviceAllocator namespace c10 { using CaptureId_t = unsigned long long; // first is set if the instance is created by Graph mode capture_begin. // second is set if the instance is created by Graph mode graph_pool_handle. using MempoolId_t = std::pair; struct C10_API DeviceAllocator : public c10::Allocator { DeviceAllocator(); ~DeviceAllocator() override; // Returns true if the allocator has been properly initialized and is ready // for use virtual bool initialized() = 0; // Releases all cached device memory from the specified memory pool back to // the system virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; // Associates a memory allocation with a stream to establish dependency // tracking. Prevents memory reuse until all operations on the specified // stream complete virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0; // Retrieves comprehensive memory statistics for the specified device, // including allocation patterns, usage metrics virtual CachingDeviceAllocator::DeviceStats getDeviceStats( c10::DeviceIndex device) = 0; // Resets cumulative allocation statistics for the specified device to zero virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; // Resets peak memory usage statistics for the specified device virtual void resetPeakStats(c10::DeviceIndex device) = 0; }; // This function is used to get the DeviceAllocator for a specific device type // and keep backward compatibility with c10::GetAllocator. C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) { TORCH_CHECK( t != DeviceType::CPU, "getDeviceAllocator is not supported for CPU device type."); auto* allocator = c10::GetAllocator(t); auto* device_allocator = dynamic_cast(allocator); TORCH_INTERNAL_ASSERT( device_allocator, "Allocator for ", t, " is not a DeviceAllocator."); return device_allocator; } } // namespace c10