[PyTorch CUDA Allocator] Allow reuse of non-split blocks with better rounding (#136174)

Summary:
This diff adds an option to control the rounding of non-split blocks in the caching allocator so that they can be reused without causing a lot of fragmentation for large memory segments.

For example, if we specify the max split size as 400MB, then allocations larger than 400MB will not be split. Let's say we have allocated some 1024MB blocks and these are cached in the allocator. If we then request a new 500MB block, we round it up to the nearest power-of-2 division, which is 512MB, and add the default kLargeBuffer of 20MB, giving 532MB. Since 532MB is less than the existing 1024MB block, that 1024MB block is not used for this allocation; instead a new 512MB block is created. In this diff, we make this rounding allowance configurable by exposing it as max_non_split_rounding_size: if 512MB + max_non_split_rounding_size is greater than 1024MB, we reuse the 1024MB block instead of creating a new 512MB block with cudaMalloc. This option lets us pre-allocate some large blocks and reuse them as much as possible, so we do not stall on cudaMalloc calls.
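
As a rough usage sketch (not part of this diff), the new knob can be combined with max_split_size_mb through the PYTORCH_CUDA_ALLOC_CONF environment variable; the values below simply mirror the example above:

    import os

    # Mirror the example above (illustrative values):
    #   - blocks larger than 400MB are never split
    #   - a request may be rounded up by as much as 1024MB, so a cached
    #     non-split 1024MB block can serve a 512MB request
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
        "max_split_size_mb:400,max_non_split_rounding_mb:1024"
    )

    import torch  # the setting is read when the CUDA caching allocator initializes

    # Illustrative 512MB allocation that could be served by a cached 1024MB block
    x = torch.empty(512 * 1024 * 1024, dtype=torch.uint8, device="cuda")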

Differential Revision: D62758758

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136174
Approved by: https://github.com/zyan0
Author: Banit Agrawal
Date: 2024-09-17 19:08:44 +00:00
Committed by: PyTorch MergeBot
Parent: e3aa5e2f64
Commit: 48d18fbd4c
4 changed files with 42 additions and 1 deletion

@@ -12,6 +12,7 @@ constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_max_split_size(std::numeric_limits<size_t>::max()),
m_max_non_split_rounding_size(kLargeBuffer),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
m_expandable_segments(false),
@@ -94,6 +95,27 @@ size_t CUDAAllocatorConfig::parseMaxSplitSize(
return i;
}
size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
constexpr int mb = 1024 * 1024;
if (++i < config.size()) {
size_t val1 = stoi(config[i]);
TORCH_CHECK(
val1 > kLargeBuffer / mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be > ",
kLargeBuffer / mb,
"");
val1 = std::max(val1, kLargeBuffer / mb);
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
m_max_non_split_rounding_size = val1 * 1024 * 1024;
} else {
TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i) {
@@ -258,6 +280,9 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
if (config_item_view == "max_split_size_mb") {
i = parseMaxSplitSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(config, i);
used_native_specific_option = true;

@@ -63,6 +63,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_roundup_power2_divisions;
}
static size_t max_non_split_rounding_size() {
return instance().m_max_non_split_rounding_size;
}
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
@@ -90,6 +94,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
@@ -108,6 +115,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t i);
std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;

@@ -2527,7 +2527,8 @@ class DeviceCachingAllocator {
return false;
// Allow oversized block size to be rounded up but within a limit
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >= p.size() + kLargeBuffer))
((*it)->size >=
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
return false;
p.block = *it;
pool.blocks.erase(it);

@@ -471,6 +471,13 @@ Available options:
set the knob value to: [256:1,512:2,1024:4,>:8].
``roundup_power2_divisions`` is only meaningful with ``backend:native``.
With ``backend:cudaMallocAsync``, ``roundup_power2_divisions`` is ignored.
* ``max_non_split_rounding_mb`` will allow non-split blocks to be rounded up for better
reuse, e.g., a 1024MB cached block can be re-used for a 512MB allocation request. In the
default case, we only allow up to 20MB of rounding for non-split blocks, so a 512MB
request can only be served by a cached block of 512-532 MB. If we set the value of this
option to 1024, blocks of 512-1536 MB can serve a 512MB request, which increases reuse
of larger blocks. This also helps reduce stalls by avoiding expensive cudaMalloc calls.
* ``garbage_collection_threshold`` helps actively reclaim unused GPU memory to
avoid triggering an expensive sync-and-reclaim-all operation (release_cached_blocks),
which can be unfavorable to latency-critical GPU applications (e.g., servers).
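
To make the reuse window from the ``max_non_split_rounding_mb`` note above concrete, here is a small illustrative sketch (the helper name below is made up and is not part of the allocator):

    def reuse_window_mb(request_mb, max_non_split_rounding_mb=20):
        # Range of cached non-split block sizes (in MB) that may serve a request:
        # a cached block is rejected once its size reaches request + rounding limit.
        return request_mb, request_mb + max_non_split_rounding_mb

    print(reuse_window_mb(512))        # (512, 532)  -> default 20MB of rounding
    print(reuse_window_mb(512, 1024))  # (512, 1536) -> max_non_split_rounding_mb:1024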