mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
* __->__ #165288 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165288 Approved by: https://github.com/albanD
242 lines
8.2 KiB
C++
242 lines
8.2 KiB
C++
#include <c10/core/AllocatorConfig.h>
|
|
#include <c10/util/env.h>
|
|
|
|
namespace c10::CachingAllocator {
|
|
|
|
namespace {

// Number of bytes in one mebibyte.
constexpr size_t kMB = size_t{1} << 20;

// Number of power-of-two size buckets tracked by roundup_power2_divisions_.
// It must equal log2(kRoundUpPowerOfTwoEnd) - log2(kRoundUpPowerOfTwoStart).
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;

// Lower and upper bounds (in bytes) of the bucketed size range.
constexpr size_t kRoundUpPowerOfTwoStart = kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = (size_t{64} << 10) * kMB; // 64GB

} // anonymous namespace
|
|
|
|
// Returns the process-wide allocator configuration singleton.
//
// On the first call, the configuration is populated from the environment: the
// candidate variables are probed in a fixed order and the first one that is
// set is handed to parseArgs(); the remaining ones are then ignored.
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
  static AcceleratorAllocatorConfig instance;

  // Initializing this function-local static runs the lambda exactly once.
  // The stored flag records whether any environment variable was parsed.
  static bool env_flag [[maybe_unused]] = []() {
    // Parse allocator configuration from environment variables.
    // The first two entries are kept for backward compatibility with legacy
    // CUDA and HIP environment variable names. The new unified variable
    // (PYTORCH_ALLOC_CONF) should be used going forward.
    // Note: keep the parsing order and logic stable to avoid potential
    // performance regressions in internal tests.
    constexpr const char* kEnvNames[] = {
        "PYTORCH_CUDA_ALLOC_CONF",
        "PYTORCH_HIP_ALLOC_CONF",
        "PYTORCH_ALLOC_CONF",
    };
    for (const char* env_name : kEnvNames) {
      const auto env_value = c10::utils::get_env(env_name);
      if (env_value.has_value()) {
        instance.parseArgs(env_value.value());
        return true;
      }
    }
    return false;
  }();

  return instance;
}
|
|
|
|
// Starts with every power-of-two bucket set to 0 divisions.
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig()
    : roundup_power2_divisions_(kRoundUpPowerOfTwoIntervals, 0) {}
|
|
|
|
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
|
|
size_t log_size = (63 - llvm::countLeadingZeros(size));
|
|
|
|
// Our intervals start at 1MB and end at 64GB
|
|
const size_t interval_start =
|
|
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
|
|
const size_t interval_end =
|
|
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
|
|
TORCH_CHECK_VALUE(
|
|
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
|
|
"kRoundUpPowerOfTwoIntervals mismatch");
|
|
|
|
size_t index =
|
|
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
|
|
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
|
|
return instance().roundup_power2_divisions_[index];
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
|
|
constexpr size_t max_allowed_split_size_mb =
|
|
std::numeric_limits<size_t>::max() / kMB;
|
|
|
|
size_t val_env = tokenizer.toSizeT(++i);
|
|
TORCH_CHECK_VALUE(
|
|
val_env >= min_allowed_split_size_mb,
|
|
"CachingAllocator option max_split_size_mb too small, must be >= ",
|
|
min_allowed_split_size_mb);
|
|
val_env = std::min(val_env, max_allowed_split_size_mb);
|
|
max_split_size_ = val_env * kMB;
|
|
|
|
return i;
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
|
|
constexpr size_t max_allowed_split_size_mb =
|
|
std::numeric_limits<size_t>::max() / kMB;
|
|
|
|
size_t val_env = tokenizer.toSizeT(++i);
|
|
TORCH_CHECK_VALUE(
|
|
val_env >= min_allowed_split_size_mb,
|
|
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
|
|
min_allowed_split_size_mb);
|
|
val_env = std::min(val_env, max_allowed_split_size_mb);
|
|
max_non_split_rounding_size_ = val_env * kMB;
|
|
|
|
return i;
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
double val_env = tokenizer.toDouble(++i);
|
|
TORCH_CHECK_VALUE(
|
|
val_env > 0 && val_env < 1.0,
|
|
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
|
|
garbage_collection_threshold_ = val_env;
|
|
|
|
return i;
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
bool first_value = true;
|
|
|
|
if (tokenizer[++i] == "[") {
|
|
size_t last_index = 0;
|
|
// NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
|
|
while (++i < tokenizer.size() && tokenizer[i] != "]") {
|
|
size_t value_index = i;
|
|
tokenizer.checkToken(++i, ":");
|
|
size_t value = tokenizer.toSizeT(++i);
|
|
TORCH_CHECK_VALUE(
|
|
value == 0 || llvm::isPowerOf2_64(value),
|
|
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
|
|
|
|
if (tokenizer[value_index] == ">") {
|
|
std::fill(
|
|
std::next(
|
|
roundup_power2_divisions_.begin(),
|
|
static_cast<std::vector<size_t>::difference_type>(last_index)),
|
|
roundup_power2_divisions_.end(),
|
|
value);
|
|
} else {
|
|
size_t boundary = tokenizer.toSizeT(value_index);
|
|
TORCH_CHECK_VALUE(
|
|
llvm::isPowerOf2_64(boundary),
|
|
"For roundups, the intervals have to be power of 2 ");
|
|
|
|
size_t index = 63 - llvm::countLeadingZeros(boundary);
|
|
index =
|
|
std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);
|
|
|
|
if (first_value) {
|
|
std::fill(
|
|
roundup_power2_divisions_.begin(),
|
|
std::next(
|
|
roundup_power2_divisions_.begin(),
|
|
static_cast<std::vector<size_t>::difference_type>(index)),
|
|
value);
|
|
first_value = false;
|
|
}
|
|
roundup_power2_divisions_[index] = value;
|
|
last_index = index;
|
|
}
|
|
|
|
if (tokenizer[i + 1] != "]") {
|
|
tokenizer.checkToken(++i, ",");
|
|
}
|
|
}
|
|
TORCH_INTERNAL_ASSERT(
|
|
i < tokenizer.size(),
|
|
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
|
} else { // Keep this for backwards compatibility
|
|
size_t value = tokenizer.toSizeT(i);
|
|
TORCH_CHECK_VALUE(
|
|
llvm::isPowerOf2_64(value),
|
|
"For roundups, the divisions has to be power of 2 ");
|
|
std::fill(
|
|
roundup_power2_divisions_.begin(),
|
|
roundup_power2_divisions_.end(),
|
|
value);
|
|
}
|
|
return i;
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
use_expandable_segments_ = tokenizer.toBool(++i);
|
|
|
|
return i;
|
|
}
|
|
|
|
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
|
|
const ConfigTokenizer& tokenizer,
|
|
size_t i) {
|
|
tokenizer.checkToken(++i, ":");
|
|
pinned_use_background_threads_ = tokenizer.toBool(++i);
|
|
|
|
return i;
|
|
}
|
|
|
|
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
|
|
// The following option will be reset to its default value if not explicitly
|
|
// set each time.
|
|
max_split_size_ = std::numeric_limits<size_t>::max();
|
|
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
|
|
garbage_collection_threshold_ = 0;
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
|
|
last_allocator_settings_ = env;
|
|
}
|
|
|
|
ConfigTokenizer tokenizer(env);
|
|
for (size_t i = 0; i < tokenizer.size(); i++) {
|
|
const auto& key = tokenizer[i];
|
|
if (key == "max_split_size_mb") {
|
|
i = parseMaxSplitSize(tokenizer, i);
|
|
} else if (key == "max_non_split_rounding_mb") {
|
|
i = parseMaxNonSplitRoundingSize(tokenizer, i);
|
|
} else if (key == "garbage_collection_threshold") {
|
|
i = parseGarbageCollectionThreshold(tokenizer, i);
|
|
} else if (key == "roundup_power2_divisions") {
|
|
i = parseRoundUpPower2Divisions(tokenizer, i);
|
|
} else if (key == "expandable_segments") {
|
|
i = parseExpandableSegments(tokenizer, i);
|
|
} else if (key == "pinned_use_background_threads") {
|
|
i = parsePinnedUseBackgroundThreads(tokenizer, i);
|
|
} else {
|
|
// If a device-specific configuration parser hook is registered, it will
|
|
// check if the key is unrecognized.
|
|
if (device_config_parser_hook_) {
|
|
TORCH_CHECK_VALUE(
|
|
getKeys().find(key) != getKeys().end(),
|
|
"Unrecognized key '",
|
|
key,
|
|
"' in Accelerator allocator config.");
|
|
}
|
|
i = tokenizer.skipKey(i);
|
|
}
|
|
|
|
if (i + 1 < tokenizer.size()) {
|
|
tokenizer.checkToken(++i, ",");
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace c10::CachingAllocator
|