Deprecate overlapping functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156165
Approved by: https://github.com/albanD
ghstack dependencies: #149601, #157908, #150312
This commit is contained in:
Yu, Guangye
2025-07-30 09:14:36 +00:00
committed by PyTorch MergeBot
parent dfacf11f66
commit 1fc010a9d8
5 changed files with 44 additions and 32 deletions

View File

@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
} }
bool pinned_use_background_threads() override { bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads(); pinned_use_background_threads();
} }

View File

@ -3,6 +3,7 @@
#include <c10/core/AllocatorConfig.h> #include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h> #include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h> #include <c10/cuda/CUDAMacros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h> #include <c10/util/Exception.h>
#include <c10/util/env.h> #include <c10/util/env.h>
@ -17,9 +18,13 @@ enum class Expandable_Segments_Handle_Type : int {
// Environment config parser // Environment config parser
class C10_CUDA_API CUDAAllocatorConfig { class C10_CUDA_API CUDAAllocatorConfig {
public: public:
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
static size_t max_split_size() { static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size(); return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
} }
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
static double garbage_collection_threshold() { static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold(); garbage_collection_threshold();
@ -60,6 +65,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_num_register_threads; return instance().m_pinned_num_register_threads;
} }
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
static bool pinned_use_background_threads() { static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads(); pinned_use_background_threads();
@ -72,25 +79,29 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128; return 128;
} }
// This is used to round-up allocation size to nearest power of 2 divisions. C10_DEPRECATED_MESSAGE(
// More description below in function roundup_power2_next_division "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
// As an example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size) { static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size); roundup_power2_divisions(size);
} }
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static std::vector<size_t> roundup_power2_divisions() { static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(); roundup_power2_divisions();
} }
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.")
static size_t max_non_split_rounding_size() { static size_t max_non_split_rounding_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig:: return c10::CachingAllocator::AcceleratorAllocatorConfig::
max_non_split_rounding_size(); max_non_split_rounding_size();
} }
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
static std::string last_allocator_settings() { static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings(); return c10::CachingAllocator::getAllocatorSettings();
} }

View File

@ -1225,7 +1225,7 @@ class DeviceCachingAllocator {
DeviceCachingAllocator() DeviceCachingAllocator()
: large_blocks(/*small=*/false), small_blocks(/*small=*/true) { : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
stats.max_split_size = stats.max_split_size =
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size()); static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
context_recorder_.store(nullptr); context_recorder_.store(nullptr);
} }
@ -1350,7 +1350,8 @@ class DeviceCachingAllocator {
// Do garbage collection if the flag is set. // Do garbage collection if the flag is set.
if (C10_UNLIKELY( if (C10_UNLIKELY(
set_fraction && set_fraction &&
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
garbage_collect_cached_blocks(context); garbage_collect_cached_blocks(context);
} }
// Attempt allocate // Attempt allocate
@ -1602,7 +1603,7 @@ class DeviceCachingAllocator {
stats.active_bytes[stat_type].increase(block->size); stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size); stats.requested_bytes[stat_type].increase(block->requested_size);
}); });
if (block->size >= CUDAAllocatorConfig::max_split_size()) if (block->size >= AcceleratorAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1); stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge = auto allocated_bytes_gauge =
@ -1653,7 +1654,7 @@ class DeviceCachingAllocator {
block->pool->owner_MempoolId(), block->pool->owner_MempoolId(),
context ? context : block->context_when_allocated); context ? context : block->context_when_allocated);
if (block->size >= CUDAAllocatorConfig::max_split_size()) if (block->size >= AcceleratorAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1); stats.oversize_allocations.decrease(1);
if (!block->stream_uses.empty()) { if (!block->stream_uses.empty()) {
@ -2202,7 +2203,8 @@ class DeviceCachingAllocator {
if (size < kMinBlockSize) { if (size < kMinBlockSize) {
return kMinBlockSize; return kMinBlockSize;
} else { } else {
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size); auto divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) { if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions); return roundup_power2_next_division(size, divisions);
} else { } else {
@ -2692,7 +2694,7 @@ class DeviceCachingAllocator {
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) { if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
return remaining >= kMinBlockSize; return remaining >= kMinBlockSize;
} else { } else {
return (size < CUDAAllocatorConfig::max_split_size()) && return (size < AcceleratorAllocatorConfig::max_split_size()) &&
(remaining > kSmallSize); (remaining > kSmallSize);
} }
} }
@ -2712,7 +2714,7 @@ class DeviceCachingAllocator {
if (C10_UNLIKELY( if (C10_UNLIKELY(
set_fraction && set_fraction &&
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled. // Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count; ++pool.get_free_blocks_call_count;
} }
@ -2754,13 +2756,13 @@ class DeviceCachingAllocator {
} }
// Do not return an oversized block for a large request // Do not return an oversized block for a large request
if ((p.size() < CUDAAllocatorConfig::max_split_size()) && if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size())) ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
return false; return false;
// Allow oversized block size to be rounded up but within a limit // Allow oversized block size to be rounded up but within a limit
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) && if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= ((*it)->size >=
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size())) p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
return false; return false;
p.block = *it; p.block = *it;
pool.blocks.erase(it); pool.blocks.erase(it);
@ -2783,7 +2785,7 @@ class DeviceCachingAllocator {
// therefore should be of less overheads. // therefore should be of less overheads.
size_t gc_threshold = static_cast<size_t>( size_t gc_threshold = static_cast<size_t>(
CUDAAllocatorConfig::garbage_collection_threshold() * AcceleratorAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum)); static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet // No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) { if (total_allocated_memory <= gc_threshold) {
@ -2931,7 +2933,7 @@ class DeviceCachingAllocator {
stats.segment[stat_type].increase(1); stats.segment[stat_type].increase(1);
stats.reserved_bytes[stat_type].increase(size); stats.reserved_bytes[stat_type].increase(size);
}); });
if (size >= CUDAAllocatorConfig::max_split_size()) if (size >= AcceleratorAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1); stats.oversize_segments.increase(1);
auto reserved_bytes_gauge = auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes); STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@ -2960,7 +2962,7 @@ class DeviceCachingAllocator {
bool release_available_cached_blocks( bool release_available_cached_blocks(
const AllocParams& p, const AllocParams& p,
const std::shared_ptr<GatheredContext>& context) { const std::shared_ptr<GatheredContext>& context) {
if (CUDAAllocatorConfig::max_split_size() == if (AcceleratorAllocatorConfig::max_split_size() ==
std::numeric_limits<size_t>::max()) std::numeric_limits<size_t>::max())
return false; return false;
BlockPool& pool = *p.pool; BlockPool& pool = *p.pool;
@ -2968,8 +2970,8 @@ class DeviceCachingAllocator {
// because of std::unique_ptr, block cannot be trivially copied // because of std::unique_ptr, block cannot be trivially copied
// Use constructor for search key. // Use constructor for search key.
Block key(p.search_key.device, p.search_key.stream, p.search_key.size); Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
key.size = (key.size < CUDAAllocatorConfig::max_split_size()) key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
? CUDAAllocatorConfig::max_split_size() ? AcceleratorAllocatorConfig::max_split_size()
: key.size; : key.size;
auto it = pool.blocks.lower_bound(&key); auto it = pool.blocks.lower_bound(&key);
if (it == pool.blocks.end() || (*it)->stream != p.stream() || if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@ -2982,7 +2984,7 @@ class DeviceCachingAllocator {
--it; // Back up one item. Now on the largest block for the correct --it; // Back up one item. Now on the largest block for the correct
// stream // stream
while ((totalReleased < key.size) && while ((totalReleased < key.size) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()) && ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->stream == p.stream())) { ((*it)->stream == p.stream())) {
auto cur = it; auto cur = it;
bool is_first = cur == pool.blocks.begin(); bool is_first = cur == pool.blocks.begin();
@ -3107,7 +3109,7 @@ class DeviceCachingAllocator {
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)] stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current); .current);
if (block->size >= CUDAAllocatorConfig::max_split_size()) if (block->size >= AcceleratorAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1); stats.oversize_segments.decrease(1);
pool->blocks.erase(block); pool->blocks.erase(block);
delete block; delete block;
@ -3734,8 +3736,8 @@ class NativeCachingAllocator : public CUDAAllocator {
auto& md = result.config_metadata; auto& md = result.config_metadata;
md.garbage_collection_threshold = md.garbage_collection_threshold =
CUDAAllocatorConfig::garbage_collection_threshold(); AcceleratorAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size(); md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
md.pinned_num_register_threads = md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads(); CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments(); md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@ -3743,9 +3745,10 @@ class NativeCachingAllocator : public CUDAAllocator {
CUDAAllocatorConfig::release_lock_on_cudamalloc(); CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register = md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register(); CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); md.last_allocator_settings =
AcceleratorAllocatorConfig::last_allocator_settings();
md.roundup_power2_divisions = md.roundup_power2_divisions =
CUDAAllocatorConfig::roundup_power2_divisions(); AcceleratorAllocatorConfig::roundup_power2_divisions();
return result; return result;
} }

View File

@ -1,3 +1,4 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/util/flat_hash_map.h> #include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/xpu/XPUCachingAllocator.h> #include <c10/xpu/XPUCachingAllocator.h>
@ -20,8 +21,6 @@ constexpr size_t kMinBlockSize = 512;
constexpr size_t kSmallSize = 1048576; constexpr size_t kSmallSize = 1048576;
// "small" allocations are packed in 2 MiB blocks // "small" allocations are packed in 2 MiB blocks
constexpr size_t kSmallBuffer = 2097152; constexpr size_t kSmallBuffer = 2097152;
// "large" allocations may be packed in 20 MiB blocks
constexpr size_t kLargeBuffer = 20971520;
// allocations between 1 and 10 MiB may use kLargeBuffer // allocations between 1 and 10 MiB may use kLargeBuffer
constexpr size_t kMinLargeAlloc = 10485760; constexpr size_t kMinLargeAlloc = 10485760;
// round up large allocations to 2 MiB // round up large allocations to 2 MiB

View File

@ -20,8 +20,8 @@
#include <ATen/cuda/detail/CUDAHooks.h> #include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/cuda/jiterator.h> #include <ATen/cuda/jiterator.h>
#include <ATen/cuda/tunable/Tunable.h> #include <ATen/cuda/tunable/Tunable.h>
#include <c10/core/AllocatorConfig.h>
#include <c10/core/StorageImpl.h> #include <c10/core/StorageImpl.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h> #include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h> #include <c10/cuda/CUDAFunctions.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh> #include <ATen/cuda/CUDAGraphsUtils.cuh>
@ -426,8 +426,7 @@ PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
PyObject* _unused, PyObject* _unused,
PyObject* env) { PyObject* env) {
HANDLE_TH_ERRORS HANDLE_TH_ERRORS
c10::cuda::CUDACachingAllocator::setAllocatorSettings( c10::CachingAllocator::setAllocatorSettings(THPUtils_unpackString(env));
THPUtils_unpackString(env));
Py_RETURN_NONE; Py_RETURN_NONE;
END_HANDLE_TH_ERRORS END_HANDLE_TH_ERRORS
} }