Refactor CUDAAllocatorConfig using ConfigTokenizer (#165281)

* #165129
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165281
Approved by: https://github.com/albanD
ghstack dependencies: #165129, #165131, #165135, #165136
Author: Yu, Guangye
Date: 2025-10-14 13:29:37 +00:00
Committed by: PyTorch MergeBot
Parent: 515b5ff539
Commit: 219fb6aafc
2 changed files with 114 additions and 217 deletions
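This change drops the hand-rolled lexArgs/consumeToken lexer in favor of the shared c10::CachingAllocator::ConfigTokenizer. The sketch below illustrates the consumption pattern the refactored code follows; it is inferred only from the call sites visible in this diff (size, operator[], checkToken, toBool, toSizeT, skipKey) and is not the exact PyTorch implementation.

// Illustrative sketch only: the ConfigTokenizer consumption pattern used by the
// refactored parsing code, based on the call sites visible in this diff.
#include <c10/core/AllocatorConfig.h>
#include <cstddef>
#include <string>

void sketch_parse(const std::string& env) {
  c10::CachingAllocator::ConfigTokenizer tokenizer(env);
  for (size_t i = 0; i < tokenizer.size(); i++) {
    const auto& key = tokenizer[i]; // key token, e.g. "pinned_num_register_threads"
    if (key == "release_lock_on_cudamalloc") {
      tokenizer.checkToken(++i, ":"); // expect the ':' separator
      bool value = tokenizer.toBool(++i); // parse "True"/"False"
      (void)value;
    } else if (key == "pinned_num_register_threads") {
      tokenizer.checkToken(++i, ":");
      size_t value = tokenizer.toSizeT(++i); // parse a numeric value
      (void)value;
    } else {
      i = tokenizer.skipKey(i); // skip an unrecognized key and its value
    }
    if (i + 1 < tokenizer.size()) {
      tokenizer.checkToken(++i, ","); // entries are comma-separated
    }
  }
}

The same key/colon/value/comma walk appears in the refactored parseArgs and its helpers in the diff below.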

c10/cuda/CUDAAllocatorConfig.cpp

@@ -1,6 +1,5 @@
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/llvmMathExtras.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
@@ -8,194 +7,119 @@
namespace c10::cuda::CUDACachingAllocator {
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_pinned_num_register_threads(1),
m_pinned_reserve_segment_size_mb(0),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::UNSPECIFIED),
#else
m_expandable_segments_handle_type(
Expandable_Segments_Handle_Type::POSIX_FD),
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_graph_capture_record_stream_reuse(false) {
}
void CUDAAllocatorConfig::lexArgs(
const std::string& env,
std::vector<std::string>& config) {
std::vector<char> buf;
for (char ch : env) {
if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
buf.clear();
}
config.emplace_back(1, ch);
} else if (ch != ' ') {
buf.emplace_back(ch);
}
}
if (!buf.empty()) {
config.emplace_back(buf.begin(), buf.end());
}
}
void CUDAAllocatorConfig::consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c) {
TORCH_CHECK(
i < config.size() && config[i] == std::string(1, c),
"Error parsing CachingAllocator settings, expected ",
c,
"");
}
size_t CUDAAllocatorConfig::parseAllocatorConfig(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i,
bool& used_cudaMallocAsync) {
// For ease of maintenance and understanding, the CUDA and ROCm
// implementations of this function are separated. This avoids having many
// #ifdef's throughout.
#ifdef USE_ROCM
// Ease burden on ROCm users by allowing either cuda or hip tokens.
// cuda token is broken up to prevent hipify matching it.
#define PYTORCH_TOKEN1 \
"cud" \
"aMallocAsync"
#define PYTORCH_TOKEN2 "hipMallocAsync"
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
(config[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
used_cudaMallocAsync =
(config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
TORCH_INTERNAL_ASSERT(
config[i] == get()->name() ||
(config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
config[i],
" != ",
get()->name());
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
}
return i;
#undef PYTORCH_TOKEN1
#undef PYTORCH_TOKEN2
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
#ifdef USE_ROCM
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
"options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
used_cudaMallocAsync =
(tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
TORCH_INTERNAL_ASSERT(
tokenizer[i] == get()->name() ||
(tokenizer[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
tokenizer[i],
" != ",
get()->name());
#else // USE_ROCM
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
((config[i] == "native") || (config[i] == "cudaMallocAsync")),
"Unknown allocator backend, "
"options are native and cudaMallocAsync");
used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
if (used_cudaMallocAsync) {
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
"Unknown allocator backend, "
"options are native and " PYTORCH_TOKEN1);
used_cudaMallocAsync = (tokenizer[i] == PYTORCH_TOKEN1);
TORCH_INTERNAL_ASSERT(
tokenizer[i] == get()->name(),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time, ",
tokenizer[i],
" != ",
get()->name());
if (used_cudaMallocAsync) {
#if CUDA_VERSION >= 11040
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif
}
TORCH_INTERNAL_ASSERT(
config[i] == get()->name(),
"Allocator backend parsed at runtime != "
"allocator backend parsed at load time");
} else {
TORCH_CHECK(false, "Error parsing backend value", "");
int version = 0;
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
TORCH_CHECK(
version >= 11040,
"backend:cudaMallocAsync requires CUDA runtime "
"11.4 or newer, but cudaDriverGetVersion returned ",
version);
#else // CUDA_VERSION >= 11040
TORCH_CHECK(
false,
"backend:cudaMallocAsync requires PyTorch to be built with "
"CUDA 11.4 or newer, but CUDA_VERSION is ",
CUDA_VERSION);
#endif // CUDA_VERSION >= 11040
}
return i;
#endif // USE_ROCM
return i;
}
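
For context, the backend key accepts native or cudaMallocAsync (and hipMallocAsync on ROCm), and choosing the async backend triggers a runtime check that the installed driver reports at least CUDA 11.4. A minimal, illustrative version of that driver query, using the same cudaDriverGetVersion call as above, is:

#include <cuda_runtime_api.h>
#include <iostream>

// Illustrative only: the same driver-version query used in the cudaMallocAsync
// branch above, printed here instead of enforced with TORCH_CHECK.
int main() {
  int version = 0;
  if (cudaDriverGetVersion(&version) == cudaSuccess) {
    std::cout << "driver reports CUDA " << version / 1000 << "."
              << (version % 1000) / 10
              << " (backend:cudaMallocAsync needs >= 11.4)\n";
  }
  return 0;
}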
void CUDAAllocatorConfig::parseArgs(const std::string& env) {
// If empty, set the default values
bool used_cudaMallocAsync = false;
bool used_native_specific_option = false;
std::vector<std::string> config;
lexArgs(env, config);
for (size_t i = 0; i < config.size(); i++) {
std::string_view config_item_view(config[i]);
if (config_item_view == "backend") {
i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
c10::CachingAllocator::ConfigTokenizer tokenizer(env);
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
i = parseAllocatorConfig(tokenizer, i, used_cudaMallocAsync);
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
config_item_view == "release_lock_on_hipmalloc" ||
config_item_view ==
key == "release_lock_on_hipmalloc" ||
key ==
"release_lock_on_c"
"udamalloc") {
used_native_specific_option = true;
consumeToken(config, ++i, ':');
++i;
TORCH_CHECK(
i < config.size() &&
(std::string_view(config[i]) == "True" ||
std::string_view(config[i]) == "False"),
"Expected a single True/False argument for release_lock_on_cudamalloc");
config_item_view = config[i];
m_release_lock_on_cudamalloc = (config_item_view == "True");
tokenizer.checkToken(++i, ":");
m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
} else if (
// ROCm build's hipify step will change "cuda" to "hip", but for ease of
// use, accept both. We must break up the string to prevent hipify here.
config_item_view == "pinned_use_hip_host_register" ||
config_item_view ==
key == "pinned_use_hip_host_register" ||
key ==
"pinned_use_c"
"uda_host_register") {
i = parsePinnedUseCudaHostRegister(config, i);
i = parsePinnedUseCudaHostRegister(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(config, i);
} else if (key == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(config, i);
} else if (key == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(tokenizer, i);
used_native_specific_option = true;
} else if (config_item_view == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(config, i);
} else if (key == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK(
keys.find(config[i]) != keys.end(),
keys.find(key) != keys.end(),
"Unrecognized key '",
config_item_view,
key,
"' in CUDA allocator config.");
// Skip the key and its value
consumeToken(config, ++i, ':');
i++; // Move to the value
if (config[i] == "[") {
// Skip config inside the list until matching ']'
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
while (++i < config.size() && config[i] != "]") {
}
}
i = tokenizer.skipKey(i);
}
if (i + 1 < config.size()) {
consumeToken(config, ++i, ',');
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
@@ -207,75 +131,48 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
}
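
For reference, this loop handles setting strings of the form key:value[,key:value...], as typically supplied through PYTORCH_CUDA_ALLOC_CONF; the concrete keys and values below are chosen purely for illustration.

#include <string>

// Illustrative only: one possible allocator config string and the token stream
// ConfigTokenizer is expected to produce for it, given the parsing loop above.
void sketch_usage() {
  const std::string env =
      "backend:native,"
      "pinned_num_register_threads:4,"
      "graph_capture_record_stream_reuse:True";
  // Expected tokens:
  //   "backend" ":" "native" ","
  //   "pinned_num_register_threads" ":" "4" ","
  //   "graph_capture_record_stream_reuse" ":" "True"
  // parseArgs(env) would then fill in the corresponding CUDAAllocatorConfig fields.
  (void)env;
}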
size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for pinned_use_cuda_host_register");
m_pinned_use_cuda_host_register = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting pinned_use_cuda_host_register value", "");
}
tokenizer.checkToken(++i, ":");
m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
return i;
}
size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for graph_capture_record_stream_reuse");
m_graph_capture_record_stream_reuse = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting graph_capture_record_stream_reuse value", "");
}
tokenizer.checkToken(++i, ":");
m_graph_capture_record_stream_reuse = tokenizer.toBool(++i);
return i;
}
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2 ",
"");
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to " +
std::to_string(maxThreads),
"");
m_pinned_num_register_threads = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_num_register_threads value", "");
}
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2, got ",
val2);
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to ",
maxThreads,
", got ",
val2);
m_pinned_num_register_threads = val2;
return i;
}
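
As an aside, the power-of-two check above comes from llvm::isPowerOf2_64 in c10/util/llvmMathExtras.h; a minimal stand-alone equivalent, shown only to make the accepted values concrete and assuming the usual single-bit semantics, is:

#include <cstdint>

// Minimal stand-in for the power-of-two test used above: true iff exactly one
// bit of v is set, so 0 is rejected. Not the PyTorch helper itself.
inline bool is_power_of_two_64(uint64_t v) {
  return v != 0 && (v & (v - 1)) == 0;
}
// e.g. 1, 2, 4, and 8 would pass; 0, 3, and 12 would be rejected by the TORCH_CHECK above.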
size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
m_pinned_reserve_segment_size_mb = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_reserve_segment_size_mb value", "");
}
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
m_pinned_reserve_segment_size_mb = val2;
return i;
}

c10/cuda/CUDAAllocatorConfig.h

@@ -1,6 +1,7 @@
#pragma once
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@@ -144,37 +145,36 @@ class C10_CUDA_API CUDAAllocatorConfig {
void parseArgs(const std::string& env);
private:
CUDAAllocatorConfig();
CUDAAllocatorConfig() = default;
static void lexArgs(const std::string& env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<size_t> m_pinned_reserve_segment_size_mb;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_graph_capture_record_stream_reuse;
std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
#if CUDA_VERSION >= 12030
{Expandable_Segments_Handle_Type::UNSPECIFIED};
#else
{Expandable_Segments_Handle_Type::POSIX_FD};
#endif
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
};
// Keep this for backwards compatibility