Refactor CUDAAllocatorConfig using ConfigTokenizer (#165281)

* #165129
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165281
Approved by: https://github.com/albanD
ghstack dependencies: #165129, #165131, #165135, #165136
Author: Yu, Guangye
Date: 2025-10-14 13:29:37 +00:00
Committed by: PyTorch MergeBot
Parent: 515b5ff539
Commit: 219fb6aafc
2 changed files with 114 additions and 217 deletions
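This change drops the hand-rolled lexArgs/consumeToken lexer in favor of the shared c10::CachingAllocator::ConfigTokenizer. The sketch below illustrates the consumption pattern the refactored code follows; it is inferred only from the call sites visible in this diff (size, operator[], checkToken, toBool, toSizeT, skipKey) and is not the exact PyTorch implementation.

// Illustrative sketch only: the ConfigTokenizer consumption pattern used by the
// refactored parsing code, based on the call sites visible in this diff.
#include <c10/core/AllocatorConfig.h>
#include <cstddef>
#include <string>

void sketch_parse(const std::string& env) {
  c10::CachingAllocator::ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    const auto& key = tokenizer[i]; // key token, e.g. "pinned_num_register_threads"
    if (key == "release_lock_on_cudamalloc") {
      tokenizer.checkToken(++i, ":"); // expect the ':' separator
      bool value = tokenizer.toBool(++i); // parse "True"/"False"
      (void)value;
    } else if (key == "pinned_num_register_threads") {
      tokenizer.checkToken(++i, ":");
      size_t value = tokenizer.toSizeT(++i); // parse a numeric value
      (void)value;
    } else {
      i = tokenizer.skipKey(i); // skip an unrecognized key and its value
    }
    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ","); // entries are comma-separated
    }
  }
}

The same key/colon/value/comma walk appears in the refactored parseArgs and its helpers in the diff below.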

c10/cuda/CUDAAllocatorConfig.cpp

@@ -1,6 +1,5 @@
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/llvmMathExtras.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
@@ -8,194 +7,119 @@
namespace c10::cuda::CUDACachingAllocator {
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_pinned_num_register_threads(1),
m_pinned_reserve_segment_size_mb(0),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::UNSPECIFIED),
#else
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::POSIX_FD),
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_graph_capture_record_stream_reuse(false) {
}
void CUDAAllocatorConfig::lexArgs(
const std::string& env,
std::vector<std::string>& config) {
std::vector<char> buf;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
buf.clear();
}
config.emplace_back(1, ch);
} else if (ch != ' ') {
buf.emplace_back(ch);
}
}
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
}
}
void CUDAAllocatorConfig::consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c) {
TORCH_CHECK(
i < config.size() && config[i] == std::string(1, c),
"Error parsing CachingAllocator settings, expected ",
c,
"");
}
size_t CUDAAllocatorConfig::parseAllocatorConfig(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i,
bool& used_cudaMallocAsync) {
// For ease of maintenance and understanding, the CUDA and ROCm
// implementations of this function are separated. This avoids having many
// #ifdef's throughout.
#ifdef USE_ROCM
// Ease burden on ROCm users by allowing either cuda or hip tokens.
// cuda token is broken up to prevent hipify matching it.
#define PYTORCH_TOKEN1 \
"cud" \
"aMallocAsync"
#define PYTORCH_TOKEN2 "hipMallocAsync"
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
(config[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
used_cudaMallocAsync =
(config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
TORCH_INTERNAL_ASSERT(
config[i] == get()->name() ||
(config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
config[i],
" != ",
get()->name());
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
}
return i;
#undef PYTORCH_TOKEN1
#undef PYTORCH_TOKEN2
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
#ifdef USE_ROCM
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
used_cudaMallocAsync =
(tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
TORCH_INTERNAL_ASSERT(
tokenizer[i] == get()->name() ||
(tokenizer[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
tokenizer[i],
" != ",
get()->name());
#else // USE_ROCM
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
((config[i] == "native") || (config[i] == "cudaMallocAsync")),
"Unknown allocator backend, "
"options are native and cudaMallocAsync");
used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
if (used_cudaMallocAsync) {
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
"Unknown allocator backend, "
"options are native and " PYTORCH_TOKEN1);
used_cudaMallocAsync = (tokenizer[i] == PYTORCH_TOKEN1);
TORCH_INTERNAL_ASSERT(
tokenizer[i] == get()->name(),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
tokenizer[i],
" != ",
get()->name());
if (used_cudaMallocAsync) {
#if CUDA_VERSION >= 11040
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif
}
TORCH_INTERNAL_ASSERT(
config[i] == get()->name(),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time");
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else // CUDA_VERSION >= 11040
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif // CUDA_VERSION >= 11040
}
return i;
#endif // USE_ROCM
return i;
}
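
For context, the backend key accepts native or cudaMallocAsync (and hipMallocAsync on ROCm), and choosing the async backend triggers a runtime check that the installed driver reports at least CUDA 11.4. A minimal, illustrative version of that driver query, using the same cudaDriverGetVersion call as above, is:

#include <cuda_runtime_api.h>
#include <iostream>

// Illustrative only: the same driver-version query used in the cudaMallocAsync
// branch above, printed here instead of enforced with TORCH_CHECK.
int main() {
  int version = 0;
  if (cudaDriverGetVersion(&version) == cudaSuccess) {
    std::cout << "driver reports CUDA " << version / 1000 << "."
              << (version % 1000) / 10
              << " (backend:cudaMallocAsync needs >= 11.4)\n";
  }
  return 0;
}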
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
// If empty, set the default values
bool used_cudaMallocAsync = false;
bool used_native_specific_option = false;
std::vector<std::string> config;
lexArgs(env, config);
for (size_t i = 0; i < config.size(); i++) {
std::string_view config_item_view(config[i]);
if (config_item_view == "backend") {
i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
c10::CachingAllocator::ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
i = parseAllocatorConfig(tokenizer, i, used_cudaMallocAsync);
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
config_item_view == "release_lock_on_hipmalloc" ||
config_item_view ==
key == "release_lock_on_hipmalloc" ||
key ==
"release_lock_on_c"
"udamalloc") {
used_native_specific_option = true;
consumeToken(config, ++i, ':');
++i;
TORCH_CHECK(
i < config.size() &&
(std::string_view(config[i]) == "True" ||
std::string_view(config[i]) == "False"),
"Expected a single True/False argument for release_lock_on_cudamalloc");
config_item_view = config[i];
m_release_lock_on_cudamalloc = (config_item_view == "True");
tokenizer.checkToken(++i, ":");
m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
config_item_view == "pinned_use_hip_host_register" ||
config_item_view ==
key == "pinned_use_hip_host_register" ||
key ==
"pinned_use_c"
"uda_host_register") {
i = parsePinnedUseCudaHostRegister(config, i);
i = parsePinnedUseCudaHostRegister(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(config, i);
} else if (key == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(config, i);
} else if (key == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(config, i);
} else if (key == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK(
keys.find(config[i]) != keys.end(),
keys.find(key) != keys.end(),
"Unrecognized key '",
config_item_view,
key,
"' in CUDA allocator config.");
// Skip the key and its value
consumeToken(config, ++i, ':');
i++; // Move to the value
if (config[i] == "[") {
// Skip config inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config.size() && config[i] != "]") {
}
}
i = tokenizer.skipKey(i);
}
if (i + 1 < config.size()) {
consumeToken(config, ++i, ',');
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
@@ -207,75 +131,48 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
}
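
For reference, this loop handles setting strings of the form key:value[,key:value...], as typically supplied through PYTORCH_CUDA_ALLOC_CONF; the concrete keys and values below are chosen purely for illustration.

#include <string>

// Illustrative only: one possible allocator config string and the token stream
// ConfigTokenizer is expected to produce for it, given the parsing loop above.
void sketch_usage() {
  const std::string env =
      "backend:native,"
      "pinned_num_register_threads:4,"
      "graph_capture_record_stream_reuse:True";
  // Expected tokens:
  //   "backend" ":" "native" ","
  //   "pinned_num_register_threads" ":" "4" ","
  //   "graph_capture_record_stream_reuse" ":" "True"
  // parseArgs(env) would then fill in the corresponding CUDAAllocatorConfig fields.
  (void)env;
}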
size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for pinned_use_cuda_host_register");
m_pinned_use_cuda_host_register = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting pinned_use_cuda_host_register value", "");
}
tokenizer.checkToken(++i, ":");
m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
return i;
}
size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for graph_capture_record_stream_reuse");
m_graph_capture_record_stream_reuse = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting graph_capture_record_stream_reuse value", "");
}
tokenizer.checkToken(++i, ":");
m_graph_capture_record_stream_reuse = tokenizer.toBool(++i);
return i;
}
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2 ",
"");
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to " +
std::to_string(maxThreads),
"");
m_pinned_num_register_threads = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_num_register_threads value", "");
}
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2, got ",
val2);
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to ",
maxThreads,
", got ",
val2);
m_pinned_num_register_threads = val2;
return i;
}
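
As an aside, the power-of-two check above comes from llvm::isPowerOf2_64 in c10/util/llvmMathExtras.h; a minimal stand-alone equivalent, shown only to make the accepted values concrete and assuming the usual single-bit semantics, is:

#include <cstdint>

// Minimal stand-in for the power-of-two test used above: true iff exactly one
// bit of v is set, so 0 is rejected. Not the PyTorch helper itself.
inline bool is_power_of_two_64(uint64_t v) {
  return v != 0 && (v & (v - 1)) == 0;
}
// e.g. 1, 2, 4, and 8 would pass; 0, 3, and 12 would be rejected by the TORCH_CHECK above.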
size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
m_pinned_reserve_segment_size_mb = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_reserve_segment_size_mb value", "");
}
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
m_pinned_reserve_segment_size_mb = val2;
return i;
}

c10/cuda/CUDAAllocatorConfig.h

@@ -1,6 +1,7 @@
#pragma once
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@@ -144,37 +145,36 @@ class C10_CUDA_API CUDAAllocatorConfig {
void parseArgs(const std::string& env);
private:
CUDAAllocatorConfig();
CUDAAllocatorConfig() = default;
static void lexArgs(const std::string& env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<size_t> m_pinned_reserve_segment_size_mb;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_graph_capture_record_stream_reuse;
std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
#if CUDA_VERSION >= 12030
{Expandable_Segments_Handle_Type::UNSPECIFIED};
#else
{Expandable_Segments_Handle_Type::POSIX_FD};
#endif
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
};
// Keep this for backwards compatibility