Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312)

# Motivation
Refactor `CUDAAllocatorConfig` to reuse `AcceleratorAllocatorConfig` and `ConfigTokenizer`. The options that overlap with `AcceleratorAllocatorConfig` will be deprecated in a follow-up PR and kept only for backward compatibility (BC).
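
The core of the change is mechanical: getters for options shared across accelerators stop reading `CUDAAllocatorConfig`'s own members and forward to `AcceleratorAllocatorConfig` instead. A minimal sketch of that pattern, distilled from the diff below (the real class keeps its CUDA-specific members and parsing on top of this):

```cpp
#include <c10/core/AllocatorConfig.h>

namespace c10::cuda::CUDACachingAllocator {

class CUDAAllocatorConfig {
 public:
  // Shared knobs: no local std::atomic members anymore, just forwarding.
  static size_t max_split_size() {
    return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
  }
  static double garbage_collection_threshold() {
    return c10::CachingAllocator::AcceleratorAllocatorConfig::
        garbage_collection_threshold();
  }
  // CUDA-only knobs (async allocator backend, cudaHostRegister pinning, ...)
  // remain members of this class and are still parsed here, now via
  // c10::CachingAllocator::ConfigTokenizer.
};

} // namespace c10::cuda::CUDACachingAllocator
```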

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150312
Approved by: https://github.com/albanD
ghstack dependencies: #149601, #157908
Yu, Guangye authored 2025-07-10 11:20:27 +00:00
committed by PyTorch MergeBot
parent 8088958793
commit 03b307575a
4 changed files with 160 additions and 496 deletions


@@ -1,16 +1,10 @@
#pragma once
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
namespace c10::cuda::CUDACachingAllocator {
enum class Expandable_Segments_Handle_Type : int {
@@ -23,20 +17,23 @@ enum class Expandable_Segments_Handle_Type : int {
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return instance().m_max_split_size;
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
}
static double garbage_collection_threshold() {
return instance().m_garbage_collection_threshold;
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
}
static bool expandable_segments() {
bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
use_expandable_segments();
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
if (instance().m_expandable_segments) {
if (enabled) {
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
}
return false;
#else
return instance().m_expandable_segments;
return enabled;
#endif
}
@@ -63,7 +60,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
}
static bool pinned_use_background_threads() {
return instance().m_pinned_use_background_threads;
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
}
static size_t pinned_max_register_threads() {
@@ -77,88 +75,97 @@ class C10_CUDA_API CUDAAllocatorConfig {
// More description below in function roundup_power2_next_division
// As an example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
static std::vector<size_t> roundup_power2_divisions() {
return instance().m_roundup_power2_divisions;
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
}
static size_t max_non_split_rounding_size() {
return instance().m_max_non_split_rounding_size;
return c10::CachingAllocator::AcceleratorAllocatorConfig::
max_non_split_rounding_size();
}
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
return c10::CachingAllocator::getAllocatorSettings();
}
static bool use_async_allocator() {
return instance().m_use_async_allocator;
}
static const std::unordered_set<std::string>& getKeys() {
return instance().keys_;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
if (!env.has_value()) {
// For backward compatibility, check for the old environment variable
// PYTORCH_CUDA_ALLOC_CONF.
env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
}
#ifdef USE_ROCM
// convenience for ROCm users, allow alternative HIP token
if (!env.has_value()) {
env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
inst->parseArgs(env);
if (env.has_value()) {
inst->parseArgs(env.value());
}
return inst;
})();
return *s_instance;
}
void parseArgs(const std::optional<std::string>& env);
void parseArgs(const std::string& env);
private:
CUDAAllocatorConfig();
CUDAAllocatorConfig() = default;
static void lexArgs(const std::string& env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseMaxNonSplitRoundingSize(
const std::vector<std::string>& config,
size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_pinned_use_background_threads;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
#if CUDA_VERSION >= 12030
{Expandable_Segments_Handle_Type::UNSPECIFIED};
#else
{Expandable_Segments_Handle_Type::POSIX_FD};
#endif
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_use_async_allocator{false};
std::atomic<bool> m_is_allocator_loaded{false};
std::unordered_set<std::string> keys_{
"backend",
// keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
// NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
"release_lock_on_cud"
"amalloc",
"pinned_use_cud"
"a_host_register",
// NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
"release_lock_on_hipmalloc",
"pinned_use_hip_host_register",
"pinned_num_register_threads"};
};
// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
// Keep this for backwards compatibility
using c10::CachingAllocator::setAllocatorSettings;
} // namespace c10::cuda::CUDACachingAllocator
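
For completeness, a hedged usage sketch of how the retained `setAllocatorSettings` alias and the forwarded getters fit together after this change. The `max_split_size_mb:<n>,garbage_collection_threshold:<x>` option syntax is the documented allocator-config format; the include path and the exact behavior of the call here are assumptions based on this diff, not code from the PR:

```cpp
#include <c10/cuda/CUDAAllocatorConfig.h>

#include <iostream>

int main() {
  namespace cuda_alloc = c10::cuda::CUDACachingAllocator;

  // After this PR, setAllocatorSettings in this namespace is a using-alias
  // for the shared c10::CachingAllocator::setAllocatorSettings (kept for BC).
  cuda_alloc::setAllocatorSettings(
      "max_split_size_mb:128,garbage_collection_threshold:0.8");

  // The CUDA getters now forward to AcceleratorAllocatorConfig, so they
  // observe whatever the shared parser recorded above.
  std::cout << "max_split_size: "
            << cuda_alloc::CUDAAllocatorConfig::max_split_size() << '\n';
  std::cout << "gc threshold: "
            << cuda_alloc::CUDAAllocatorConfig::garbage_collection_threshold()
            << '\n';
}
```

Environment-based configuration follows the same path: per the new `instance()` logic above, `PYTORCH_ALLOC_CONF` is consulted first, with `PYTORCH_CUDA_ALLOC_CONF` (and `PYTORCH_HIP_ALLOC_CONF` under ROCm) kept as backward-compatible fallbacks.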