diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp
index 34aa15d0c06c..39fd0e16fac5 100644
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
   }
 
   bool pinned_use_background_threads() override {
-    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
         pinned_use_background_threads();
   }
 
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 382a50c08f6d..8c4b613473c0 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -17,9 +18,13 @@ enum class Expandable_Segments_Handle_Type : int {
 // Environment config parser
 class C10_CUDA_API CUDAAllocatorConfig {
  public:
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
   static size_t max_split_size() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
   }
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
   static double garbage_collection_threshold() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         garbage_collection_threshold();
@@ -60,6 +65,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_pinned_num_register_threads;
   }
 
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
   static bool pinned_use_background_threads() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         pinned_use_background_threads();
@@ -72,25 +79,29 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return 128;
   }
 
-  // This is used to round-up allocation size to nearest power of 2 divisions.
-  // More description below in function roundup_power2_next_division
-  // As an example, if we want 4 divisions between 2's power, this can be done
-  // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
   static size_t roundup_power2_divisions(size_t size) {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         roundup_power2_divisions(size);
   }
 
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
   static std::vector roundup_power2_divisions() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         roundup_power2_divisions();
   }
 
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.")
   static size_t max_non_split_rounding_size() {
     return c10::CachingAllocator::AcceleratorAllocatorConfig::
         max_non_split_rounding_size();
   }
 
+  C10_DEPRECATED_MESSAGE(
+      "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
   static std::string last_allocator_settings() {
     return c10::CachingAllocator::getAllocatorSettings();
   }
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 1ee03914807a..b0b1be8937a9 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -1225,7 +1225,7 @@ class DeviceCachingAllocator {
   DeviceCachingAllocator()
       : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
     stats.max_split_size =
-        static_cast(CUDAAllocatorConfig::max_split_size());
+        static_cast(AcceleratorAllocatorConfig::max_split_size());
     context_recorder_.store(nullptr);
   }
 
@@ -1350,7 +1350,8 @@ class DeviceCachingAllocator {
     // Do garbage collection if the flag is set.
     if (C10_UNLIKELY(
             set_fraction &&
-            CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+            AcceleratorAllocatorConfig::garbage_collection_threshold() >
+                0.0)) {
       garbage_collect_cached_blocks(context);
     }
     // Attempt allocate
@@ -1602,7 +1603,7 @@ class DeviceCachingAllocator {
       stats.active_bytes[stat_type].increase(block->size);
       stats.requested_bytes[stat_type].increase(block->requested_size);
     });
-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_allocations.increase(1);
 
     auto allocated_bytes_gauge =
@@ -1653,7 +1654,7 @@ class DeviceCachingAllocator {
         block->pool->owner_MempoolId(),
         context ? context : block->context_when_allocated);
 
-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_allocations.decrease(1);
 
     if (!block->stream_uses.empty()) {
@@ -2202,7 +2203,8 @@ class DeviceCachingAllocator {
     if (size < kMinBlockSize) {
       return kMinBlockSize;
     } else {
-      auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
+      auto divisions =
+          AcceleratorAllocatorConfig::roundup_power2_divisions(size);
       if (divisions > 1 && size > (kMinBlockSize * divisions)) {
         return roundup_power2_next_division(size, divisions);
       } else {
@@ -2692,7 +2694,7 @@ class DeviceCachingAllocator {
     if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
       return remaining >= kMinBlockSize;
     } else {
-      return (size < CUDAAllocatorConfig::max_split_size()) &&
+      return (size < AcceleratorAllocatorConfig::max_split_size()) &&
           (remaining > kSmallSize);
     }
   }
@@ -2712,7 +2714,7 @@ class DeviceCachingAllocator {
 
     if (C10_UNLIKELY(
             set_fraction &&
-            CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
+            AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
       // Track block reuse interval only when garbage collection is enabled.
       ++pool.get_free_blocks_call_count;
     }
 
@@ -2754,13 +2756,13 @@ class DeviceCachingAllocator {
     }
 
     // Do not return an oversized block for a large request
-    if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
-        ((*it)->size >= CUDAAllocatorConfig::max_split_size()))
+    if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
+        ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
       return false;
     // Allow oversized block size to be rounded up but within a limit
-    if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
+    if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
         ((*it)->size >=
-         p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
+         p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
       return false;
     p.block = *it;
     pool.blocks.erase(it);
@@ -2783,7 +2785,7 @@ class DeviceCachingAllocator {
     // therefore should be of less overheads.
 
     size_t gc_threshold = static_cast(
-        CUDAAllocatorConfig::garbage_collection_threshold() *
+        AcceleratorAllocatorConfig::garbage_collection_threshold() *
         static_cast(allowed_memory_maximum));
     // No need to trigger GC yet
    if (total_allocated_memory <= gc_threshold) {
@@ -2931,7 +2933,7 @@ class DeviceCachingAllocator {
       stats.segment[stat_type].increase(1);
       stats.reserved_bytes[stat_type].increase(size);
     });
-    if (size >= CUDAAllocatorConfig::max_split_size())
+    if (size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_segments.increase(1);
     auto reserved_bytes_gauge =
         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@@ -2960,7 +2962,7 @@ class DeviceCachingAllocator {
   bool release_available_cached_blocks(
       const AllocParams& p,
       const std::shared_ptr& context) {
-    if (CUDAAllocatorConfig::max_split_size() ==
+    if (AcceleratorAllocatorConfig::max_split_size() ==
         std::numeric_limits::max())
       return false;
     BlockPool& pool = *p.pool;
@@ -2968,8 +2970,8 @@ class DeviceCachingAllocator {
     // because of std::unique_ptr, block cannot be trivially copied
     // Use constructor for search key.
     Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
-    key.size = (key.size < CUDAAllocatorConfig::max_split_size())
-        ? CUDAAllocatorConfig::max_split_size()
+    key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
+        ? AcceleratorAllocatorConfig::max_split_size()
         : key.size;
     auto it = pool.blocks.lower_bound(&key);
     if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@@ -2982,7 +2984,7 @@ class DeviceCachingAllocator {
     --it; // Back up one item. Now on the largest block for the correct
          // stream
     while ((totalReleased < key.size) &&
-           ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
+           ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
           ((*it)->stream == p.stream())) {
       auto cur = it;
       bool is_first = cur == pool.blocks.begin();
@@ -3107,7 +3109,7 @@ class DeviceCachingAllocator {
             stats.reserved_bytes[static_cast(StatType::AGGREGATE)]
                 .current);
 
-    if (block->size >= CUDAAllocatorConfig::max_split_size())
+    if (block->size >= AcceleratorAllocatorConfig::max_split_size())
       stats.oversize_segments.decrease(1);
     pool->blocks.erase(block);
     delete block;
@@ -3734,8 +3736,8 @@ class NativeCachingAllocator : public CUDAAllocator {
 
     auto& md = result.config_metadata;
     md.garbage_collection_threshold =
-        CUDAAllocatorConfig::garbage_collection_threshold();
-    md.max_split_size = CUDAAllocatorConfig::max_split_size();
+        AcceleratorAllocatorConfig::garbage_collection_threshold();
+    md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
     md.pinned_num_register_threads =
         CUDAAllocatorConfig::pinned_num_register_threads();
     md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@@ -3743,9 +3745,10 @@ class NativeCachingAllocator : public CUDAAllocator {
         CUDAAllocatorConfig::release_lock_on_cudamalloc();
     md.pinned_use_host_register =
         CUDAAllocatorConfig::pinned_use_cuda_host_register();
-    md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
+    md.last_allocator_settings =
+        AcceleratorAllocatorConfig::last_allocator_settings();
     md.roundup_power2_divisions =
-        CUDAAllocatorConfig::roundup_power2_divisions();
+        AcceleratorAllocatorConfig::roundup_power2_divisions();
 
     return result;
   }
diff --git a/c10/xpu/XPUCachingAllocator.cpp b/c10/xpu/XPUCachingAllocator.cpp
index 543b48f08113..afae32d92a4b 100644
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@@ -1,3 +1,4 @@
+#include
 #include
 #include
 #include
@@ -20,8 +21,6 @@ constexpr size_t kMinBlockSize = 512;
 constexpr size_t kSmallSize = 1048576;
 // "small" allocations are packed in 2 MiB blocks
 constexpr size_t kSmallBuffer = 2097152;
-// "large" allocations may be packed in 20 MiB blocks
-constexpr size_t kLargeBuffer = 20971520;
 // allocations between 1 and 10 MiB may use kLargeBuffer
 constexpr size_t kMinLargeAlloc = 10485760;
 // round up large allocations to 2 MiB
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index b44ce311ecd9..ead46337ff09 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -20,8 +20,8 @@
 #include
 #include
 #include
+#include
 #include
-#include
 #include
 #include
 #include
@@ -426,8 +426,7 @@ PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
     PyObject* _unused,
     PyObject* env) {
   HANDLE_TH_ERRORS
-  c10::cuda::CUDACachingAllocator::setAllocatorSettings(
-      THPUtils_unpackString(env));
+  c10::CachingAllocator::setAllocatorSettings(THPUtils_unpackString(env));
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }
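
Note on migrating callers (not part of the patch above): the deprecated c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig accessors now forward to the device-generic c10::CachingAllocator::AcceleratorAllocatorConfig, which this patch also uses directly inside the CUDA and XPU allocators. Below is a minimal caller-side sketch; the include path and the log_allocator_config() helper are assumptions for illustration, not identifiers taken from the patch.

#include <c10/core/AllocatorConfig.h> // assumed header exposing AcceleratorAllocatorConfig
#include <iostream>

// Illustrative helper (hypothetical): queries allocator settings through the
// device-generic config API instead of the deprecated CUDA-specific wrappers.
void log_allocator_config() {
  using c10::CachingAllocator::AcceleratorAllocatorConfig;
  std::cout << "max_split_size: "
            << AcceleratorAllocatorConfig::max_split_size() << '\n'
            << "garbage_collection_threshold: "
            << AcceleratorAllocatorConfig::garbage_collection_threshold() << '\n'
            << "pinned_use_background_threads: "
            << AcceleratorAllocatorConfig::pinned_use_background_threads() << '\n'
            << "settings: " << c10::CachingAllocator::getAllocatorSettings()
            << std::endl;
}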