diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 19aedb2cbb02..7b410bdd6ef5 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -12,6 +12,7 @@ constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
 
 CUDAAllocatorConfig::CUDAAllocatorConfig()
     : m_max_split_size(std::numeric_limits<size_t>::max()),
+      m_max_non_split_rounding_size(kLargeBuffer),
       m_garbage_collection_threshold(0),
       m_pinned_num_register_threads(1),
       m_expandable_segments(false),
@@ -94,6 +95,27 @@ size_t CUDAAllocatorConfig::parseMaxSplitSize(
   return i;
 }
 
+size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  constexpr int mb = 1024 * 1024;
+  if (++i < config.size()) {
+    size_t val1 = stoi(config[i]);
+    TORCH_CHECK(
+        val1 > kLargeBuffer / mb,
+        "CachingAllocator option max_non_split_rounding_mb too small, must be > ",
+        kLargeBuffer / mb,
+        "");
+    val1 = std::max(val1, kLargeBuffer / mb);
+    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
+    m_max_non_split_rounding_size = val1 * 1024 * 1024;
+  } else {
+    TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
+  }
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
     const std::vector<std::string>& config,
     size_t i) {
@@ -258,6 +280,9 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
     if (config_item_view == "max_split_size_mb") {
       i = parseMaxSplitSize(config, i);
       used_native_specific_option = true;
+    } else if (config_item_view == "max_non_split_rounding_mb") {
+      i = parseMaxNonSplitRoundingSize(config, i);
+      used_native_specific_option = true;
     } else if (config_item_view == "garbage_collection_threshold") {
       i = parseGarbageCollectionThreshold(config, i);
       used_native_specific_option = true;
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 3106fc1b46ba..38adc4732e3d 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -63,6 +63,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_roundup_power2_divisions;
   }
 
+  static size_t max_non_split_rounding_size() {
+    return instance().m_max_non_split_rounding_size;
+  }
+
   static std::string last_allocator_settings() {
     std::lock_guard<std::mutex> lock(
         instance().m_last_allocator_settings_mutex);
@@ -90,6 +94,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
       size_t i,
       const char c);
   size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
+  size_t parseMaxNonSplitRoundingSize(
+      const std::vector<std::string>& config,
+      size_t i);
   size_t parseGarbageCollectionThreshold(
       const std::vector<std::string>& config,
       size_t i);
@@ -108,6 +115,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
       size_t i);
 
   std::atomic<size_t> m_max_split_size;
+  std::atomic<size_t> m_max_non_split_rounding_size;
   std::vector<size_t> m_roundup_power2_divisions;
   std::atomic<double> m_garbage_collection_threshold;
   std::atomic<size_t> m_pinned_num_register_threads;
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 3da3c6d4f5d0..a67a720717bb 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2527,7 +2527,8 @@ class DeviceCachingAllocator {
       return false;
     // Allow oversized block size to be rounded up but within a limit
     if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
-        ((*it)->size >= p.size() + kLargeBuffer))
+        ((*it)->size >=
+         p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
      return false;
     p.block = *it;
     pool.blocks.erase(it);
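The hunk above only replaces the hard-coded kLargeBuffer limit in the free-block lookup with the new configurable one. As an editor's illustration (not PyTorch code), the sketch below restates that reuse rule in isolation: the function name, the standalone structure, and the omission of the max_split_size_mb precondition are my simplifications, and the 20MiB default is the one quoted in the docs change below.

// Standalone sketch of the non-split rounding rule: a cached block may serve
// a smaller request only while the wasted "rounding" stays below the limit.
#include <cstddef>
#include <cstdio>

constexpr std::size_t MiB = 1024 * 1024;

bool fits_within_rounding_limit(
    std::size_t cached_block_size,
    std::size_t requested_size,
    std::size_t max_non_split_rounding) {
  // Accept only blocks at least as large as the request but less than
  // request + limit; larger blocks are rejected (kept for bigger requests).
  return cached_block_size >= requested_size &&
      cached_block_size < requested_size + max_non_split_rounding;
}

int main() {
  const std::size_t request = 512 * MiB;
  // Default limit (kLargeBuffer, 20MiB): a 1024MiB cached block is rejected.
  std::printf(
      "%d\n", fits_within_rounding_limit(1024 * MiB, request, 20 * MiB));
  // With max_non_split_rounding_mb:1024, the same 1024MiB block is accepted.
  std::printf(
      "%d\n", fits_within_rounding_limit(1024 * MiB, request, 1024 * MiB));
  return 0;
}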
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index 7d434bbbba64..c0b82adc7e07 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -471,6 +471,13 @@ Available options:
   set the knob value to: [256:1,512:2,1024:4,>:8].
   ``roundup_power2_divisions`` is only meaningful with ``backend:native``.
   With ``backend:cudaMallocAsync``, ``roundup_power2_divisions`` is ignored.
+* ``max_non_split_rounding_mb`` allows oversized non-split blocks to be rounded up
+  for better reuse, e.g., a 1024MB cached block can serve a 512MB allocation request.
+  By default we only allow up to 20MB of rounding for non-split blocks, so a 512MB
+  request can only be served by a cached block of 512-532MB. If this option is set
+  to 1024, blocks of 512-1536MB can be used for a 512MB request, which increases
+  reuse of larger blocks. This also helps reduce stalls by avoiding expensive
+  cudaMalloc calls.
 * ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to
   avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks),
   which can be unfavorable to latency-critical GPU applications (e.g., servers).
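Usage note: like the other native-allocator options listed in this section of cuda.rst, the new knob is read from the allocator settings string parsed above, so it would be enabled with something like ``PYTORCH_CUDA_ALLOC_CONF=max_non_split_rounding_mb:1024``; the option name comes from the parser in this diff, and the environment variable is the one this doc section already covers.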