Remove unused code in CUDAAllocatorConfig (#165136)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165136
Approved by: https://github.com/Skylion007
ghstack dependencies: #165129, #165131, #165135
Author: Yu, Guangye
Date: 2025-10-14 13:29:35 +00:00
Committed by: PyTorch MergeBot
Parent: 608a6d4a26
Commit: 515b5ff539
2 changed files with 22 additions and 235 deletions
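
Summary, for orientation before the diff: CUDAAllocatorConfig had grown its own parsers and cached state for options (max_split_size_mb, max_non_split_rounding_mb, garbage_collection_threshold, roundup_power2_divisions) that the device-generic AcceleratorAllocatorConfig already parses. This commit deletes the duplicates and keeps thin forwarders. Below is a minimal compilable sketch of the resulting delegation pattern; the accel namespace is a hypothetical stand-in for c10::CachingAllocator::AcceleratorAllocatorConfig, and the defaults are simplified.

// --- Editorial sketch (not part of the commit) ---------------------------
// Minimal illustration of the delegation pattern this commit applies.
// `accel` is a hypothetical stand-in for the device-generic config class
// that now owns the parsing and state for the removed options.
#include <cstddef>
#include <iostream>
#include <limits>

namespace accel {
inline size_t max_split_size() {
  return std::numeric_limits<size_t>::max(); // default: no split cap
}
inline size_t roundup_power2_divisions(size_t /*size*/) {
  return 0; // default: rounding disabled
}
} // namespace accel

// The CUDA-side config keeps only one-line forwarders, mirroring the
// inline roundup_power2_divisions definition in the header diff below.
struct CudaAllocatorConfigSketch {
  static size_t max_split_size() { return accel::max_split_size(); }
  static size_t roundup_power2_divisions(size_t size) {
    return accel::roundup_power2_divisions(size);
  }
};

int main() {
  std::cout << CudaAllocatorConfigSketch::max_split_size() << '\n';
  return 0;
}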

c10/cuda/CUDAAllocatorConfig.cpp

@@ -8,15 +8,9 @@
namespace c10::cuda::CUDACachingAllocator {
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_max_split_size(std::numeric_limits<size_t>::max()),
m_max_non_split_rounding_size(kLargeBuffer),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
: m_pinned_num_register_threads(1),
m_pinned_reserve_segment_size_mb(0),
m_expandable_segments(false),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::UNSPECIFIED),
@@ -26,14 +20,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_graph_capture_record_stream_reuse(false),
m_pinned_use_background_threads(false) {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}
size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
m_graph_capture_record_stream_reuse(false) {
}
void CUDAAllocatorConfig::lexArgs(
@@ -68,148 +55,6 @@ void CUDAAllocatorConfig::consumeToken(
"");
}
size_t CUDAAllocatorConfig::parseMaxSplitSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
constexpr int mb = 1024 * 1024;
if (++i < config.size()) {
size_t val1 = stoi(config[i]);
TORCH_CHECK(
val1 > kLargeBuffer / mb,
"CachingAllocator option max_split_size_mb too small, must be > ",
kLargeBuffer / mb,
"");
val1 = std::max(val1, kLargeBuffer / mb);
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
m_max_split_size = val1 * 1024 * 1024;
} else {
TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
constexpr int mb = 1024 * 1024;
if (++i < config.size()) {
size_t val1 = stoi(config[i]);
TORCH_CHECK(
val1 > kLargeBuffer / mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be > ",
kLargeBuffer / mb,
"");
val1 = std::max(val1, kLargeBuffer / mb);
val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
m_max_non_split_rounding_size = val1 * 1024 * 1024;
} else {
TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
double val1 = stod(config[i]);
TORCH_CHECK(
val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", "");
TORCH_CHECK(
val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", "");
m_garbage_collection_threshold = val1;
} else {
TORCH_CHECK(
false, "Error, expecting garbage_collection_threshold value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
bool first_value = true;
if (++i < config.size()) {
if (std::string_view(config[i]) == "[") {
size_t last_index = 0;
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config.size() && std::string_view(config[i]) != "]") {
const std::string& val1 = config[i];
size_t val2 = 0;
consumeToken(config, ++i, ':');
if (++i < config.size()) {
val2 = stoi(config[i]);
} else {
TORCH_CHECK(
false, "Error parsing roundup_power2_divisions value", "");
}
TORCH_CHECK(
val2 == 0 || llvm::isPowerOf2_64(val2),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ",
"");
if (std::string_view(val1) == ">") {
std::fill(
std::next(
m_roundup_power2_divisions.begin(),
static_cast<std::vector<unsigned long>::difference_type>(
last_index)),
m_roundup_power2_divisions.end(),
val2);
} else {
size_t val1_long = stoul(val1);
TORCH_CHECK(
llvm::isPowerOf2_64(val1_long),
"For roundups, the intervals have to be power of 2 ",
"");
size_t index = 63 - llvm::countLeadingZeros(val1_long);
index = std::max((size_t)0, index);
index = std::min(index, m_roundup_power2_divisions.size() - 1);
if (first_value) {
std::fill(
m_roundup_power2_divisions.begin(),
std::next(
m_roundup_power2_divisions.begin(),
static_cast<std::vector<unsigned long>::difference_type>(
index)),
val2);
first_value = false;
}
if (index < m_roundup_power2_divisions.size()) {
m_roundup_power2_divisions[index] = val2;
}
last_index = index;
}
if (std::string_view(config[i + 1]) != "]") {
consumeToken(config, ++i, ',');
}
}
} else { // Keep this for backwards compatibility
size_t val1 = stoi(config[i]);
TORCH_CHECK(
llvm::isPowerOf2_64(val1),
"For roundups, the divisions has to be power of 2 ",
"");
std::fill(
m_roundup_power2_divisions.begin(),
m_roundup_power2_divisions.end(),
val1);
}
} else {
TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
@@ -285,47 +130,16 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
// If empty, set the default values
m_max_split_size = std::numeric_limits<size_t>::max();
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
m_garbage_collection_threshold = 0;
bool used_cudaMallocAsync = false;
bool used_native_specific_option = false;
{
std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
m_last_allocator_settings = env;
}
std::vector<std::string> config;
lexArgs(env, config);
for (size_t i = 0; i < config.size(); i++) {
std::string_view config_item_view(config[i]);
if (config_item_view == "max_split_size_mb") {
i = parseMaxSplitSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "max_non_split_rounding_mb") {
i = parseMaxNonSplitRoundingSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "garbage_collection_threshold") {
i = parseGarbageCollectionThreshold(config, i);
used_native_specific_option = true;
} else if (config_item_view == "roundup_power2_divisions") {
i = parseRoundUpPower2Divisions(config, i);
used_native_specific_option = true;
} else if (config_item_view == "backend") {
if (config_item_view == "backend") {
i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
} else if (config_item_view == "expandable_segments") {
used_native_specific_option = true;
consumeToken(config, ++i, ':');
++i;
TORCH_CHECK(
i < config.size() &&
(std::string_view(config[i]) == "True" ||
std::string_view(config[i]) == "False"),
"Expected a single True/False argument for expandable_segments");
config_item_view = config[i];
m_expandable_segments = (config_item_view == "True");
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
@@ -358,15 +172,26 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else if (config_item_view == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(config, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK(
false, "Unrecognized CachingAllocator option: ", config_item_view);
keys.find(config[i]) != keys.end(),
"Unrecognized key '",
config_item_view,
"' in CUDA allocator config.");
// Skip the key and its value
consumeToken(config, ++i, ':');
i++; // Move to the value
if (config[i] == "[") {
// Skip config inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config.size() && config[i] != "]") {
}
}
}
if (i + 1 < config.size()) {
@@ -454,22 +279,6 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
return i;
}
size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for pinned_use_background_threads");
m_pinned_use_background_threads = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting pinned_use_background_threads value", "");
}
return i;
}
REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)
} // namespace c10::cuda::CUDACachingAllocator
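
A note on the new error path in parseArgs above: instead of rejecting every option it does not parse itself, the parser now accepts any key registered with AcceleratorAllocatorConfig::getKeys() and skips over its value, including a bracketed list. A standalone sketch of that token-skipping logic, assuming the same "key : value" / "key : [ ... ]" token stream that lexArgs produces; TORCH_CHECK and consumeToken are replaced with plain exceptions:

// --- Editorial sketch (not part of the commit) ---------------------------
// Simplified version of the key-skipping added to parseArgs.
#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

// Skips one "key : value" or "key : [ v1 , v2 , ... ]" group starting at
// tokens[i]; returns the index of the last consumed token.
size_t skipSharedKey(
    const std::vector<std::string>& tokens,
    size_t i,
    const std::unordered_set<std::string>& known_keys) {
  if (known_keys.find(tokens[i]) == known_keys.end()) {
    throw std::runtime_error("Unrecognized key '" + tokens[i] + "'");
  }
  if (++i >= tokens.size() || tokens[i] != ":") {
    throw std::runtime_error("Expected ':' after key");
  }
  ++i; // move to the value token
  if (i < tokens.size() && tokens[i] == "[") {
    // Skip everything inside the list until the matching ']'.
    while (++i < tokens.size() && tokens[i] != "]") {
    }
  }
  return i;
}

int main() {
  std::vector<std::string> toks{"max_split_size_mb", ":", "512"};
  std::unordered_set<std::string> keys{"max_split_size_mb"};
  return skipSharedKey(toks, 0, keys) == 2 ? 0 : 1;
}

The design choice: shared options are parsed once by the generic config, so the CUDA parser only has to verify that a key is known and step past it rather than duplicate the handling.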

c10/cuda/CUDAAllocatorConfig.h

@@ -79,11 +79,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128;
}
// This is used to round-up allocation size to nearest power of 2 divisions.
// More description below in function roundup_power2_next_division
// As an example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
@@ -152,16 +151,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
@@ -175,28 +164,17 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
size_t parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<size_t> m_pinned_reserve_segment_size_mb;
std::atomic<bool> m_expandable_segments;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_graph_capture_record_stream_reuse;
std::atomic<bool> m_pinned_use_background_threads;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// Keep this for backwards compatibility
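
On the roundup_power2_divisions example kept in the header comment above (PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4): with 4 divisions, a requested size that falls between 1024 and 2048 is rounded up to the nearest of 1280, 1536, 1792, or 2048, so a request of 1200 becomes 1280. The removed list parser mapped each power-of-two interval bound to a slot in its 16-entry table via 63 - llvm::countLeadingZeros(bound). A small sketch of that index computation, using a portable bit loop in place of the LLVM helper:

// --- Editorial sketch (not part of the commit) ---------------------------
// Index computation from the removed parseRoundUpPower2Divisions: a
// power-of-two interval bound maps to one slot of the 16-entry divisions
// table (16 == kRoundUpPowerOfTwoIntervals).
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

size_t highest_bit_index(uint64_t v) {
  size_t idx = 0;
  while (v >>= 1) {
    ++idx; // position of the most significant set bit, e.g. 1024 -> 10
  }
  return idx;
}

int main() {
  const size_t kIntervals = 16; // matches kRoundUpPowerOfTwoIntervals
  // Bounds as they would appear in e.g. roundup_power2_divisions:[256:1,512:2]
  for (uint64_t bound : {256u, 512u, 1024u}) {
    size_t index = std::min(highest_bit_index(bound), kIntervals - 1);
    std::cout << bound << " -> slot " << index << '\n';
  }
  return 0;
}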