From 3dd872e6d53560933d8d7fc11357617746d37168 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 25 Jun 2025 00:11:35 +0000
Subject: [PATCH] Revert "Add DeviceAllocator as the base device allocator
 (#138222)"

This reverts commit 92409b6c89fbfbd3caa79c81b1e3d9e7917d3bc7.

Reverted https://github.com/pytorch/pytorch/pull/138222 on behalf of https://github.com/Camyll due to internal build failures ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3002206756))
---
 aten/src/ATen/cuda/CUDAGraph.cpp              |  1 +
 aten/src/ATen/cuda/CUDAGraph.h                |  1 -
 .../hip/impl/HIPAllocatorMasqueradingAsCUDA.h | 26 ++-------
 .../HIPCachingAllocatorMasqueradingAsCUDA.cpp |  7 +--
 c10/core/CachingDeviceAllocator.cpp           | 10 ----
 c10/core/CachingDeviceAllocator.h             | 53 -------------------
 c10/cuda/CUDACachingAllocator.cpp             |  6 +--
 c10/cuda/CUDACachingAllocator.h               | 13 +++--
 c10/cuda/CUDAGraphsC10Utils.h                 |  6 +++
 c10/cuda/CUDAMallocAsyncAllocator.cpp         |  5 +-
 c10/xpu/XPUCachingAllocator.cpp               | 19 +++----
 torch/csrc/cuda/CUDAPluggableAllocator.cpp    |  3 +-
 torch/csrc/cuda/CUDAPluggableAllocator.h      |  2 +-
 13 files changed, 33 insertions(+), 119 deletions(-)
 delete mode 100644 c10/core/CachingDeviceAllocator.cpp

diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index 2800e505a9b7..7fba7c4c7424 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h
index 4f2aa31dd1c3..c8cae16b624f 100644
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@@ -2,7 +2,6 @@
 #include
 #include
-#include <c10/core/CachingDeviceAllocator.h>
 #include
 #include
 #include
diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
index c1ecea34db16..39ab441478e8 100644
--- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
@@ -1,6 +1,6 @@
 #pragma once

-#include <c10/core/CachingDeviceAllocator.h>
+#include <c10/core/Allocator.h>
 #include

 // Use of c10::hip namespace here makes hipification easier, because
@@ -10,10 +10,10 @@ namespace c10::hip {

 // Takes a valid HIPAllocator (of any sort) and turns it into
 // an allocator pretending to be a CUDA allocator.  See
 // Note [Masquerading as CUDA]
-class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
-  DeviceAllocator* allocator_;
+class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
+  Allocator* allocator_;
 public:
-  explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
+  explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
     : allocator_(allocator) {}
   DataPtr allocate(size_t size) override {
     DataPtr r = allocator_->allocate(size);
@@ -26,24 +26,6 @@ public:
   void copy_data(void* dest, const void* src, std::size_t count) const final {
     allocator_->copy_data(dest, src, count);
   }
-  bool initialized() override {
-    return allocator_->initialized();
-  }
-  void emptyCache(MempoolId_t mempool_id = {0, 0}) {
-    allocator_->emptyCache(mempool_id);
-  }
-  void recordStream(const DataPtr& ptr, c10::Stream stream) {
-    allocator_->recordStream(ptr, stream);
-  }
-  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
-    return allocator_->getDeviceStats(device);
-  }
-  void resetAccumulatedStats(c10::DeviceIndex device) {
-    allocator_->resetAccumulatedStats(device);
-  }
-  void resetPeakStats(c10::DeviceIndex device) {
-    allocator_->resetPeakStats(device);
-  }
 };

 } // namespace c10::hip
diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp
index 19bc0a6b34e5..46f7d247293a 100644
--- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp
+++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp
@@ -4,9 +4,8 @@
 namespace c10 { namespace hip {
 namespace HIPCachingAllocatorMasqueradingAsCUDA {

-static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
-
 Allocator* get() {
+  static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
   return &allocator;
 }

@@ -14,9 +13,5 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
   HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
 }

-// Register this HIP allocator as CUDA allocator to enable access through both
-// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
-REGISTER_ALLOCATOR(kCUDA, &allocator)
-
 } // namespace HIPCachingAllocatorMasqueradingAsCUDA
 }} // namespace c10::hip
diff --git a/c10/core/CachingDeviceAllocator.cpp b/c10/core/CachingDeviceAllocator.cpp
deleted file mode 100644
index 582efd59cf1b..000000000000
--- a/c10/core/CachingDeviceAllocator.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <c10/core/CachingDeviceAllocator.h>
-
-namespace c10 {
-
-// Ensures proper DLL export of this pure virtual base class on Windows,
-// since it's mainly used in other DLLs outside c10.dll.
-DeviceAllocator::DeviceAllocator() = default;
-DeviceAllocator::~DeviceAllocator() = default;
-
-} // namespace c10
diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h
index 0bec03ae417f..b23490de693a 100644
--- a/c10/core/CachingDeviceAllocator.h
+++ b/c10/core/CachingDeviceAllocator.h
@@ -1,7 +1,6 @@
 #pragma once

 #include <c10/core/Allocator.h>
-#include <c10/core/Stream.h>

 namespace c10::CachingDeviceAllocator {
@@ -60,55 +59,3 @@ struct DeviceStats {
 };

 } // namespace c10::CachingDeviceAllocator
-
-namespace c10 {
-
-using CaptureId_t = unsigned long long;
-
-// first is set if the instance is created by Graph mode capture_begin.
-// second is set if the instance is created by Graph mode graph_pool_handle.
-using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
-
-struct C10_API DeviceAllocator : public c10::Allocator {
-  DeviceAllocator();
-  ~DeviceAllocator() override;
-
-  // Returns true if the allocator has been properly initialized and is ready
-  // for use
-  virtual bool initialized() = 0;
-
-  // Releases all cached device memory from the specified memory pool back to
-  // the system
-  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
-
-  // Associates a memory allocation with a stream to establish dependency
-  // tracking. Prevents memory reuse until all operations on the specified
-  // stream complete
-  virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;
-
-  // Retrieves comprehensive memory statistics for the specified device,
-  // including allocation patterns, usage metrics
-  virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
-      c10::DeviceIndex device) = 0;
-
-  // Resets cumulative allocation statistics for the specified device to zero
-  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
-
-  // Resets peak memory usage statistics for the specified device
-  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
-};
-
-// This function is used to get the DeviceAllocator for a specific device type
-// and keep backward compatibility with c10::GetAllocator.
-C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
-  TORCH_CHECK(
-      t != DeviceType::CPU,
-      "getDeviceAllocator is not supported for CPU device type.");
-  auto* allocator = c10::GetAllocator(t);
-  auto* device_allocator = dynamic_cast<DeviceAllocator*>(allocator);
-  TORCH_INTERNAL_ASSERT(
-      device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
-  return device_allocator;
-}
-
-} // namespace c10
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index f6754ec59641..e152feba9ccc 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -3695,7 +3695,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     return device_allocator[block->device]->shareIpcHandle(block);
   }

-  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
+  void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
     // Empty tensor's storage().data() might be a null ptr. As there is no
     // blocks associated with those tensors, it is fine to do nothing here.
     if (!ptr.get()) {
@@ -3713,8 +3713,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     Block* block = get_allocated_block(ptr.get());
     // block must not be null reaching here
     TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found");
-    c10::cuda::CUDAStream cuda_stream{stream};
-    device_allocator[block->device]->recordStream(block, cuda_stream);
+    device_allocator[block->device]->recordStream(block, stream);
   }

   SnapshotInfo snapshot(MempoolId_t mempool_id) override {
@@ -4179,7 +4178,6 @@ struct BackendStaticInitializer {
   BackendStaticInitializer() {
     auto r = parseEnvForBackend();
-    at::SetAllocator(kCUDA, r, 0);
     allocator.store(r);
   }
 };
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 0f9b06adc03a..a6fa61110d67 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -202,18 +202,25 @@ struct ShareableHandle {
   std::string handle;
 };

-class CUDAAllocator : public DeviceAllocator {
+class CUDAAllocator : public Allocator {
  public:
   virtual void* raw_alloc(size_t nbytes) = 0;
   virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
   virtual void raw_delete(void* ptr) = 0;
   virtual void init(int device_count) = 0;
+  virtual bool initialized() = 0;
   virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
   virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
+  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
   virtual void enable(bool value) = 0;
   virtual bool isEnabled() const = 0;
   virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
   virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
+  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
+  virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
+      c10::DeviceIndex device) = 0;
+  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
+  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
   virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
   virtual void beginAllocateToPool(
       c10::DeviceIndex device,
@@ -518,10 +525,6 @@ inline void enablePeerAccess(

 namespace c10::cuda {

-// Keep BC only
-using c10::CaptureId_t;
-using c10::MempoolId_t;
-
 // MemPool represents a pool of memory in a caching allocator. Currently,
 // it's just the ID of the pool object maintained in the CUDACachingAllocator.
 //
diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h
index 936875fd71d5..eb29ca8bc9f0 100644
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@@ -9,6 +9,12 @@

 namespace c10::cuda {

+using CaptureId_t = unsigned long long;
+
+// first is set if the instance is created by CUDAGraph::capture_begin.
+// second is set if the instance is created by at::cuda::graph_pool_handle.
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
 // RAII guard for "cudaStreamCaptureMode", a thread-local value
 // that controls the error-checking strictness of a capture.
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
index 28c60a0afaa9..b5f313e419db 100644
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -607,7 +607,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     return ptr;
   }

-  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
+  void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
     std::lock_guard<std::mutex> lk(general_mutex);
     auto ptr_val = ptr.get();
     // Empty tensor's storage().data() might be a null ptr. As there is no
@@ -620,8 +620,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     auto it = ptr_info.find(ptr_val);
     TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info");

-    c10::cuda::CUDAStream cuda_stream{stream};
-    UsageStream to_record{cuda_stream.stream(), stream.device_index()};
+    UsageStream to_record{stream.stream(), stream.device_index()};
     if (to_record == it->second.creation_stream) {
       TORCH_WARN_ONCE(
           "Called record_stream on tensor whose original creation stream "
diff --git a/c10/xpu/XPUCachingAllocator.cpp b/c10/xpu/XPUCachingAllocator.cpp
index a5e088515ff5..543b48f08113 100644
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@@ -540,7 +540,7 @@ class DeviceCachingAllocator {

 static void local_raw_delete(void* ptr);

-class XPUAllocator : public DeviceAllocator {
+class XPUAllocator : public Allocator {
 private:
  std::mutex mutex;
  ska::flat_hash_map<void*, Block*> allocated_blocks;
@@ -576,10 +576,6 @@ class XPUAllocator : public DeviceAllocator {
     }
   }

-  bool initialized() override {
-    return !device_allocators.empty();
-  }
-
   void malloc(
       void** devPtr,
       DeviceIndex device,
@@ -614,13 +610,13 @@ class XPUAllocator : public DeviceAllocator {
     }
   }

-  void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
+  void emptyCache() {
     for (auto& da : device_allocators) {
       da->emptyCache();
     }
   }

-  void recordStream(const DataPtr& ptr, c10::Stream stream) override {
+  void recordStream(const DataPtr& ptr, XPUStream stream) {
     if (!ptr.get()) {
       return;
     }
@@ -630,8 +626,7 @@ class XPUAllocator : public DeviceAllocator {
     Block* block = get_allocated_block(ptr.get());
     TORCH_CHECK(block, "No allocated block can be found.");

-    c10::xpu::XPUStream xpu_stream{stream};
-    device_allocators[block->device]->recordStream(block, xpu_stream);
+    device_allocators[block->device]->recordStream(block, stream);
   }

   DataPtr allocate(size_t size) override {
@@ -684,17 +679,17 @@ class XPUAllocator : public DeviceAllocator {
         ": did you call init?");
   }

-  DeviceStats getDeviceStats(DeviceIndex device) override {
+  DeviceStats getDeviceStats(DeviceIndex device) {
     assertValidDevice(device);
     return device_allocators[device]->getStats();
   }

-  void resetPeakStats(DeviceIndex device) override {
+  void resetPeakStats(DeviceIndex device) {
     assertValidDevice(device);
     device_allocators[device]->resetPeakStats();
   }

-  void resetAccumulatedStats(DeviceIndex device) override {
+  void resetAccumulatedStats(DeviceIndex device) {
     assertValidDevice(device);
     device_allocators[device]->resetAccumulatedStats();
   }
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
index 9c5d17a3805c..43606807c6e4 100644
--- a/torch/csrc/cuda/CUDAPluggableAllocator.cpp
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -210,8 +210,7 @@ void* CUDAPluggableAllocator::getBaseAllocation(void* ptr, size_t* size) {

 void CUDAPluggableAllocator::recordStream(
     const c10::DataPtr& ptr,
-    c10::Stream c10_stream) {
-  streamType stream{c10_stream};
+    streamType stream) {
   if (record_stream_fn_) {
     record_stream_fn_(ptr.get(), stream);
   }
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.h b/torch/csrc/cuda/CUDAPluggableAllocator.h
index ed31c82a0109..5a1b7be0a15d 100644
--- a/torch/csrc/cuda/CUDAPluggableAllocator.h
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -122,7 +122,7 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
   void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override;
   void* getBaseAllocation(void* ptr, size_t* size) override;

-  void recordStream(const c10::DataPtr&, c10::Stream stream) override;
+  void recordStream(const c10::DataPtr&, streamType stream) override;

   c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
       c10::DeviceIndex device) override;
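
For reviewers: the caller-visible effect of this revert is that the device-generic allocator interface goes away and callers return to backend-specific entry points, with recordStream() once again taking a concrete CUDAStream/XPUStream rather than c10::Stream. The sketch below is a standalone, simplified analogue of the pattern being removed, not PyTorch source: the registry(), the integer device tag, and FakeCudaAllocator are invented stand-ins, while the downcast mirrors the deleted getDeviceAllocator() helper from c10/core/CachingDeviceAllocator.h.

// Standalone sketch (hypothetical names, not PyTorch code) of the reverted
// DeviceAllocator pattern: a shared base class registered per device type
// and recovered via dynamic_cast, like the deleted c10::getDeviceAllocator().
#include <cassert>
#include <iostream>
#include <map>

struct Allocator {
  virtual ~Allocator() = default;
};

// The reverted PR gave each caching backend (CUDA, XPU, HIP-as-CUDA) a
// common base with cache-management hooks; this is a minimal analogue.
struct DeviceAllocator : Allocator {
  virtual bool initialized() = 0;
  virtual void emptyCache() = 0;
};

struct FakeCudaAllocator final : DeviceAllocator {
  bool initialized() override { return true; }
  void emptyCache() override { std::cout << "releasing cached blocks\n"; }
};

// Registry keyed by a device-type tag, standing in for c10::GetAllocator.
std::map<int, Allocator*>& registry() {
  static std::map<int, Allocator*> r;
  return r;
}

// Analogue of the deleted helper: fetch the registered allocator and
// downcast, asserting that it really is a DeviceAllocator.
DeviceAllocator* getDeviceAllocator(int device_type) {
  auto* device_allocator = dynamic_cast<DeviceAllocator*>(registry()[device_type]);
  assert(device_allocator && "registered allocator is not a DeviceAllocator");
  return device_allocator;
}

int main() {
  constexpr int kCUDA = 1; // stand-in for c10::DeviceType::CUDA
  FakeCudaAllocator cuda;
  registry()[kCUDA] = &cuda; // what REGISTER_ALLOCATOR(kCUDA, &allocator) did
  if (getDeviceAllocator(kCUDA)->initialized()) {
    getDeviceAllocator(kCUDA)->emptyCache(); // device-agnostic cache release
  }
}

With the revert in place, the equivalent cache release goes through the backend-specific API directly, e.g. c10::cuda::CUDACachingAllocator::emptyCache().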