Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Refactor CUDAAllocatorConfig using ConfigTokenizer (#165281)
* #165129

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165281
Approved by: https://github.com/albanD
ghstack dependencies: #165129, #165131, #165135, #165136
committed by PyTorch MergeBot
parent 515b5ff539
commit 219fb6aafc
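Note: this change replaces CUDAAllocatorConfig's hand-rolled lexArgs/consumeToken parsing with the shared c10::CachingAllocator::ConfigTokenizer (from c10/core/AllocatorConfig.h, which CUDAAllocatorConfig.h already includes). The snippet below is an illustrative sketch, not code from this commit: it only shows the comma-separated key:value format that parseArgs() consumes, which in practice usually arrives through the PYTORCH_CUDA_ALLOC_CONF environment variable. The keys listed are the ones handled in the diff; the values are made-up examples.

// Illustrative only: a config string of the shape CUDAAllocatorConfig::parseArgs()
// expects. Each key:value pair below is dispatched to one of the helpers in the
// diff; the values themselves are arbitrary examples.
#include <string>

int main() {
  const std::string env =
      "backend:native,"                          // parseAllocatorConfig
      "release_lock_on_cudamalloc:True,"         // parsed in parseArgs via toBool
      "pinned_use_cuda_host_register:True,"      // parsePinnedUseCudaHostRegister
      "pinned_num_register_threads:8,"           // parsePinnedNumRegisterThreads
      "pinned_reserve_segment_size_mb:16,"       // parsePinnedReserveSegmentSize
      "graph_capture_record_stream_reuse:False"; // parseGraphCaptureRecordStreamReuse
  (void)env; // in PyTorch this string would be handed to parseArgs(env)
  return 0;
}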
c10/cuda/CUDAAllocatorConfig.cpp
@@ -1,6 +1,5 @@
 #include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/util/llvmMathExtras.h>
 
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include <c10/cuda/driver_api.h>
@@ -8,194 +7,119 @@
 
 namespace c10::cuda::CUDACachingAllocator {
 
-CUDAAllocatorConfig::CUDAAllocatorConfig()
-    : m_pinned_num_register_threads(1),
-      m_pinned_reserve_segment_size_mb(0),
-#if CUDA_VERSION >= 12030
-      m_expandable_segments_handle_type(
-          Expandable_Segments_Handle_Type::UNSPECIFIED),
-#else
-      m_expandable_segments_handle_type(
-          Expandable_Segments_Handle_Type::POSIX_FD),
-#endif
-      m_release_lock_on_cudamalloc(false),
-      m_pinned_use_cuda_host_register(false),
-      m_graph_capture_record_stream_reuse(false) {
-}
-
-void CUDAAllocatorConfig::lexArgs(
-    const std::string& env,
-    std::vector<std::string>& config) {
-  std::vector<char> buf;
-
-  for (char ch : env) {
-    if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
-      if (!buf.empty()) {
-        config.emplace_back(buf.begin(), buf.end());
-        buf.clear();
-      }
-      config.emplace_back(1, ch);
-    } else if (ch != ' ') {
-      buf.emplace_back(ch);
-    }
-  }
-  if (!buf.empty()) {
-    config.emplace_back(buf.begin(), buf.end());
-  }
-}
-
-void CUDAAllocatorConfig::consumeToken(
-    const std::vector<std::string>& config,
-    size_t i,
-    const char c) {
-  TORCH_CHECK(
-      i < config.size() && config[i] == std::string(1, c),
-      "Error parsing CachingAllocator settings, expected ",
-      c,
-      "");
-}
-
 size_t CUDAAllocatorConfig::parseAllocatorConfig(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i,
     bool& used_cudaMallocAsync) {
-  // For ease of maintenance and understanding, the CUDA and ROCm
-  // implementations of this function are separated. This avoids having many
-  // #ifdef's throughout.
-#ifdef USE_ROCM
   // Ease burden on ROCm users by allowing either cuda or hip tokens.
   // cuda token is broken up to prevent hipify matching it.
 #define PYTORCH_TOKEN1 \
   "cud"                \
   "aMallocAsync"
 #define PYTORCH_TOKEN2 "hipMallocAsync"
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        ((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
-         (config[i] == PYTORCH_TOKEN2)),
-        "Unknown allocator backend, "
-        "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
-    used_cudaMallocAsync =
-        (config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
-    TORCH_INTERNAL_ASSERT(
-        config[i] == get()->name() ||
-            (config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
-        "Allocator backend parsed at runtime != "
-        "allocator backend parsed at load time, ",
-        config[i],
-        " != ",
-        get()->name());
-  } else {
-    TORCH_CHECK(false, "Error parsing backend value", "");
-  }
-  return i;
-#undef PYTORCH_TOKEN1
-#undef PYTORCH_TOKEN2
+  tokenizer.checkToken(++i, ":");
+  i++; // Move to the value after the colon
+#ifdef USE_ROCM
+  TORCH_CHECK(
+      ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
+       (tokenizer[i] == PYTORCH_TOKEN2)),
+      "Unknown allocator backend, "
+      "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
+  used_cudaMallocAsync =
+      (tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
+  TORCH_INTERNAL_ASSERT(
+      tokenizer[i] == get()->name() ||
+          (tokenizer[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
+      "Allocator backend parsed at runtime != "
+      "allocator backend parsed at load time, ",
+      tokenizer[i],
+      " != ",
+      get()->name());
 #else // USE_ROCM
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        ((config[i] == "native") || (config[i] == "cudaMallocAsync")),
-        "Unknown allocator backend, "
-        "options are native and cudaMallocAsync");
-    used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
-    if (used_cudaMallocAsync) {
+  TORCH_CHECK(
+      ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
+      "Unknown allocator backend, "
+      "options are native and " PYTORCH_TOKEN1);
+  used_cudaMallocAsync = (tokenizer[i] == PYTORCH_TOKEN1);
+  TORCH_INTERNAL_ASSERT(
+      tokenizer[i] == get()->name(),
+      "Allocator backend parsed at runtime != "
+      "allocator backend parsed at load time, ",
+      tokenizer[i],
+      " != ",
+      get()->name());
+  if (used_cudaMallocAsync) {
 #if CUDA_VERSION >= 11040
     int version = 0;
     C10_CUDA_CHECK(cudaDriverGetVersion(&version));
     TORCH_CHECK(
         version >= 11040,
         "backend:cudaMallocAsync requires CUDA runtime "
         "11.4 or newer, but cudaDriverGetVersion returned ",
         version);
-#else
+#else // CUDA_VERSION >= 11040
     TORCH_CHECK(
         false,
         "backend:cudaMallocAsync requires PyTorch to be built with "
         "CUDA 11.4 or newer, but CUDA_VERSION is ",
         CUDA_VERSION);
-#endif
-    }
-    TORCH_INTERNAL_ASSERT(
-        config[i] == get()->name(),
-        "Allocator backend parsed at runtime != "
-        "allocator backend parsed at load time");
-  } else {
-    TORCH_CHECK(false, "Error parsing backend value", "");
+#endif // CUDA_VERSION >= 11040
   }
-  return i;
 #endif // USE_ROCM
+  return i;
 }
 
 void CUDAAllocatorConfig::parseArgs(const std::string& env) {
-  // If empty, set the default values
   bool used_cudaMallocAsync = false;
   bool used_native_specific_option = false;
 
-  std::vector<std::string> config;
-  lexArgs(env, config);
-  for (size_t i = 0; i < config.size(); i++) {
-    std::string_view config_item_view(config[i]);
-    if (config_item_view == "backend") {
-      i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
+  c10::CachingAllocator::ConfigTokenizer tokenizer(env);
+  for (size_t i = 0; i < tokenizer.size(); i++) {
+    const auto& key = tokenizer[i];
+    if (key == "backend") {
+      i = parseAllocatorConfig(tokenizer, i, used_cudaMallocAsync);
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config_item_view == "release_lock_on_hipmalloc" ||
-        config_item_view ==
+        key == "release_lock_on_hipmalloc" ||
+        key ==
             "release_lock_on_c"
             "udamalloc") {
       used_native_specific_option = true;
-      consumeToken(config, ++i, ':');
-      ++i;
-      TORCH_CHECK(
-          i < config.size() &&
-              (std::string_view(config[i]) == "True" ||
-               std::string_view(config[i]) == "False"),
-          "Expected a single True/False argument for release_lock_on_cudamalloc");
-      config_item_view = config[i];
-      m_release_lock_on_cudamalloc = (config_item_view == "True");
+      tokenizer.checkToken(++i, ":");
+      m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config_item_view == "pinned_use_hip_host_register" ||
-        config_item_view ==
+        key == "pinned_use_hip_host_register" ||
+        key ==
             "pinned_use_c"
             "uda_host_register") {
-      i = parsePinnedUseCudaHostRegister(config, i);
+      i = parsePinnedUseCudaHostRegister(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "pinned_num_register_threads") {
-      i = parsePinnedNumRegisterThreads(config, i);
+    } else if (key == "pinned_num_register_threads") {
+      i = parsePinnedNumRegisterThreads(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "pinned_reserve_segment_size_mb") {
-      i = parsePinnedReserveSegmentSize(config, i);
+    } else if (key == "pinned_reserve_segment_size_mb") {
+      i = parsePinnedReserveSegmentSize(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "graph_capture_record_stream_reuse") {
-      i = parseGraphCaptureRecordStreamReuse(config, i);
+    } else if (key == "graph_capture_record_stream_reuse") {
+      i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
       used_native_specific_option = true;
     } else {
       const auto& keys =
           c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
       TORCH_CHECK(
-          keys.find(config[i]) != keys.end(),
+          keys.find(key) != keys.end(),
           "Unrecognized key '",
-          config_item_view,
+          key,
           "' in CUDA allocator config.");
       // Skip the key and its value
-      consumeToken(config, ++i, ':');
-      i++; // Move to the value
-      if (config[i] == "[") {
-        // Skip config inside the list until matching ']'
-        // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
-        while (++i < config.size() && config[i] != "]") {
-        }
-      }
+      i = tokenizer.skipKey(i);
     }
 
-    if (i + 1 < config.size()) {
-      consumeToken(config, ++i, ',');
+    if (i + 1 < tokenizer.size()) {
+      tokenizer.checkToken(++i, ",");
     }
   }
 
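The net effect of the hunk above is that bounds checks, separator checks, and True/False validation move out of parseArgs and its helpers and into the tokenizer. A minimal standalone sketch of the contract the new call sites rely on follows; it is a hypothetical illustration, not the actual c10::CachingAllocator::ConfigTokenizer implementation, and the MiniTokenizer name is made up.

// Hypothetical sketch of the tokenizer contract assumed by the refactored code:
// out-of-range indices, unexpected separators, and non-boolean values all fail
// inside the helper rather than at every call site.
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

struct MiniTokenizer {
  std::vector<std::string> tokens; // e.g. {"key", ":", "True", ",", ...}

  const std::string& at(size_t i) const {
    if (i >= tokens.size()) {
      throw std::runtime_error("unexpected end of allocator config");
    }
    return tokens[i];
  }
  void checkToken(size_t i, const std::string& expected) const {
    if (at(i) != expected) {
      throw std::runtime_error("expected '" + expected + "', got '" + at(i) + "'");
    }
  }
  bool toBool(size_t i) const {
    const std::string& v = at(i);
    if (v == "True") {
      return true;
    }
    if (v == "False") {
      return false;
    }
    throw std::runtime_error("expected True/False, got '" + v + "'");
  }
};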
@@ -207,75 +131,48 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
 }
 
 size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        (config[i] == "True" || config[i] == "False"),
-        "Expected a single True/False argument for pinned_use_cuda_host_register");
-    m_pinned_use_cuda_host_register = (config[i] == "True");
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_use_cuda_host_register value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
   return i;
 }
 
 size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        (config[i] == "True" || config[i] == "False"),
-        "Expected a single True/False argument for graph_capture_record_stream_reuse");
-    m_graph_capture_record_stream_reuse = (config[i] == "True");
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting graph_capture_record_stream_reuse value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  m_graph_capture_record_stream_reuse = tokenizer.toBool(++i);
 
   return i;
 }
 
 size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    size_t val2 = stoi(config[i]);
-    TORCH_CHECK(
-        llvm::isPowerOf2_64(val2),
-        "Number of register threads has to be power of 2 ",
-        "");
-    auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
-    TORCH_CHECK(
-        val2 <= maxThreads,
-        "Number of register threads should be less than or equal to " +
-            std::to_string(maxThreads),
-        "");
-    m_pinned_num_register_threads = val2;
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_num_register_threads value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  size_t val2 = tokenizer.toSizeT(++i);
+  TORCH_CHECK(
+      llvm::isPowerOf2_64(val2),
+      "Number of register threads has to be power of 2, got ",
+      val2);
+  auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
+  TORCH_CHECK(
+      val2 <= maxThreads,
+      "Number of register threads should be less than or equal to ",
+      maxThreads,
+      ", got ",
+      val2);
+  m_pinned_num_register_threads = val2;
   return i;
 }
 
 size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    size_t val2 = stoi(config[i]);
-    TORCH_CHECK(
-        val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
-    m_pinned_reserve_segment_size_mb = val2;
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_reserve_segment_size_mb value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  size_t val2 = tokenizer.toSizeT(++i);
+  TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
+  m_pinned_reserve_segment_size_mb = val2;
   return i;
 }
 
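parsePinnedNumRegisterThreads keeps its two invariants after the refactor: the thread count must be a power of two and must not exceed pinned_max_register_threads(); only the raw-string handling moved into tokenizer.toSizeT(). For illustration, here is a standalone check equivalent to the llvm::isPowerOf2_64 call used above (generic C++, not the LLVM helper itself):

// A power of two has exactly one bit set, so v & (v - 1) clears it to zero;
// zero itself is not a power of two. This mirrors llvm::isPowerOf2_64.
#include <cstdint>

constexpr bool is_power_of_two_64(std::uint64_t v) {
  return v != 0 && (v & (v - 1)) == 0;
}

static_assert(is_power_of_two_64(8), "8 register threads would be accepted");
static_assert(!is_power_of_two_64(6), "6 register threads would be rejected");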
c10/cuda/CUDAAllocatorConfig.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/core/AllocatorConfig.h>
+#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
@@ -144,37 +145,36 @@ class C10_CUDA_API CUDAAllocatorConfig {
   void parseArgs(const std::string& env);
 
  private:
-  CUDAAllocatorConfig();
+  CUDAAllocatorConfig() = default;
 
-  static void lexArgs(const std::string& env, std::vector<std::string>& config);
-  static void consumeToken(
-      const std::vector<std::string>& config,
-      size_t i,
-      const char c);
   size_t parseAllocatorConfig(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i,
       bool& used_cudaMallocAsync);
   size_t parsePinnedUseCudaHostRegister(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parsePinnedNumRegisterThreads(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parsePinnedReserveSegmentSize(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parseGraphCaptureRecordStreamReuse(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
 
-  std::atomic<size_t> m_pinned_num_register_threads;
-  std::atomic<size_t> m_pinned_reserve_segment_size_mb;
-  std::atomic<Expandable_Segments_Handle_Type>
-      m_expandable_segments_handle_type;
-  std::atomic<bool> m_release_lock_on_cudamalloc;
-  std::atomic<bool> m_pinned_use_cuda_host_register;
-  std::atomic<bool> m_graph_capture_record_stream_reuse;
+  std::atomic<size_t> m_pinned_num_register_threads{1};
+  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
+  std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
+#if CUDA_VERSION >= 12030
+      {Expandable_Segments_Handle_Type::UNSPECIFIED};
+#else
+      {Expandable_Segments_Handle_Type::POSIX_FD};
+#endif
+  std::atomic<bool> m_release_lock_on_cudamalloc{false};
+  std::atomic<bool> m_pinned_use_cuda_host_register{false};
+  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
 };
 
 // Keep this for backwards compatibility
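With the user-defined constructor removed, the defaults that the old member-initializer list supplied now live as in-class default member initializers, so CUDAAllocatorConfig() = default; yields the same initial state. A generic before/after illustration of that equivalence (hypothetical type and member names, not taken from the commit):

// Both structs initialize their members identically; the second form needs no
// user-written constructor, which is what the header change above relies on.
#include <atomic>
#include <cstddef>

struct BeforeStyle {
  BeforeStyle() : m_threads(1), m_use_host_register(false) {}
  std::atomic<std::size_t> m_threads;
  std::atomic<bool> m_use_host_register;
};

struct AfterStyle {
  std::atomic<std::size_t> m_threads{1};
  std::atomic<bool> m_use_host_register{false};
};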