Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Refactor CUDAAllocatorConfig using ConfigTokenizer (#165281)
* #165129

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165281
Approved by: https://github.com/albanD
ghstack dependencies: #165129, #165131, #165135, #165136
committed by PyTorch MergeBot
parent 515b5ff539
commit 219fb6aafc
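Note: this change replaces CUDAAllocatorConfig's hand-rolled lexArgs/consumeToken parsing with the shared c10::CachingAllocator::ConfigTokenizer (from c10/core/AllocatorConfig.h, which CUDAAllocatorConfig.h already includes). The snippet below is an illustrative sketch, not code from this commit: it only shows the comma-separated key:value format that parseArgs() consumes, which in practice usually arrives through the PYTORCH_CUDA_ALLOC_CONF environment variable. The keys listed are the ones handled in the diff; the values are made-up examples.

// Illustrative only: a config string of the shape CUDAAllocatorConfig::parseArgs()
// expects. Each key:value pair below is dispatched to one of the helpers in the
// diff; the values themselves are arbitrary examples.
#include <string>

int main() {
  const std::string env =
      "backend:native,"                          // parseAllocatorConfig
      "release_lock_on_cudamalloc:True,"         // parsed in parseArgs via toBool
      "pinned_use_cuda_host_register:True,"      // parsePinnedUseCudaHostRegister
      "pinned_num_register_threads:8,"           // parsePinnedNumRegisterThreads
      "pinned_reserve_segment_size_mb:16,"       // parsePinnedReserveSegmentSize
      "graph_capture_record_stream_reuse:False"; // parseGraphCaptureRecordStreamReuse
  (void)env; // in PyTorch this string would be handed to parseArgs(env)
  return 0;
}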
c10/cuda/CUDAAllocatorConfig.cpp
@@ -1,6 +1,5 @@
 #include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/util/llvmMathExtras.h>
 
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include <c10/cuda/driver_api.h>
@@ -8,194 +7,119 @@
 
 namespace c10::cuda::CUDACachingAllocator {
 
-CUDAAllocatorConfig::CUDAAllocatorConfig()
-    : m_pinned_num_register_threads(1),
-      m_pinned_reserve_segment_size_mb(0),
-#if CUDA_VERSION >= 12030
-      m_expandable_segments_handle_type(
-          Expandable_Segments_Handle_Type::UNSPECIFIED),
-#else
-      m_expandable_segments_handle_type(
-          Expandable_Segments_Handle_Type::POSIX_FD),
-#endif
-      m_release_lock_on_cudamalloc(false),
-      m_pinned_use_cuda_host_register(false),
-      m_graph_capture_record_stream_reuse(false) {
-}
-
-void CUDAAllocatorConfig::lexArgs(
-    const std::string& env,
-    std::vector<std::string>& config) {
-  std::vector<char> buf;
-
-  for (char ch : env) {
-    if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
-      if (!buf.empty()) {
-        config.emplace_back(buf.begin(), buf.end());
-        buf.clear();
-      }
-      config.emplace_back(1, ch);
-    } else if (ch != ' ') {
-      buf.emplace_back(ch);
-    }
-  }
-  if (!buf.empty()) {
-    config.emplace_back(buf.begin(), buf.end());
-  }
-}
-
-void CUDAAllocatorConfig::consumeToken(
-    const std::vector<std::string>& config,
-    size_t i,
-    const char c) {
-  TORCH_CHECK(
-      i < config.size() && config[i] == std::string(1, c),
-      "Error parsing CachingAllocator settings, expected ",
-      c,
-      "");
-}
-
 size_t CUDAAllocatorConfig::parseAllocatorConfig(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i,
     bool& used_cudaMallocAsync) {
-  // For ease of maintenance and understanding, the CUDA and ROCm
-  // implementations of this function are separated. This avoids having many
-  // #ifdef's throughout.
-#ifdef USE_ROCM
   // Ease burden on ROCm users by allowing either cuda or hip tokens.
   // cuda token is broken up to prevent hipify matching it.
 #define PYTORCH_TOKEN1 \
   "cud"                \
   "aMallocAsync"
 #define PYTORCH_TOKEN2 "hipMallocAsync"
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        ((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) ||
-         (config[i] == PYTORCH_TOKEN2)),
-        "Unknown allocator backend, "
-        "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
-    used_cudaMallocAsync =
-        (config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2);
-    TORCH_INTERNAL_ASSERT(
-        config[i] == get()->name() ||
-            (config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
-        "Allocator backend parsed at runtime != "
-        "allocator backend parsed at load time, ",
-        config[i],
-        " != ",
-        get()->name());
-  } else {
-    TORCH_CHECK(false, "Error parsing backend value", "");
-  }
-  return i;
-#undef PYTORCH_TOKEN1
-#undef PYTORCH_TOKEN2
+  tokenizer.checkToken(++i, ":");
+  i++; // Move to the value after the colon
+#ifdef USE_ROCM
+  TORCH_CHECK(
+      ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
+       (tokenizer[i] == PYTORCH_TOKEN2)),
+      "Unknown allocator backend, "
+      "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2);
+  used_cudaMallocAsync =
+      (tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2);
+  TORCH_INTERNAL_ASSERT(
+      tokenizer[i] == get()->name() ||
+          (tokenizer[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2),
+      "Allocator backend parsed at runtime != "
+      "allocator backend parsed at load time, ",
+      tokenizer[i],
+      " != ",
+      get()->name());
 #else // USE_ROCM
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        ((config[i] == "native") || (config[i] == "cudaMallocAsync")),
-        "Unknown allocator backend, "
-        "options are native and cudaMallocAsync");
-    used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
-    if (used_cudaMallocAsync) {
+  TORCH_CHECK(
+      ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
+      "Unknown allocator backend, "
+      "options are native and " PYTORCH_TOKEN1);
+  used_cudaMallocAsync = (tokenizer[i] == PYTORCH_TOKEN1);
+  TORCH_INTERNAL_ASSERT(
+      tokenizer[i] == get()->name(),
+      "Allocator backend parsed at runtime != "
+      "allocator backend parsed at load time, ",
+      tokenizer[i],
+      " != ",
+      get()->name());
+  if (used_cudaMallocAsync) {
 #if CUDA_VERSION >= 11040
     int version = 0;
     C10_CUDA_CHECK(cudaDriverGetVersion(&version));
     TORCH_CHECK(
         version >= 11040,
         "backend:cudaMallocAsync requires CUDA runtime "
         "11.4 or newer, but cudaDriverGetVersion returned ",
         version);
-#else
+#else // CUDA_VERSION >= 11040
     TORCH_CHECK(
         false,
         "backend:cudaMallocAsync requires PyTorch to be built with "
         "CUDA 11.4 or newer, but CUDA_VERSION is ",
         CUDA_VERSION);
-#endif
-    }
-    TORCH_INTERNAL_ASSERT(
-        config[i] == get()->name(),
-        "Allocator backend parsed at runtime != "
-        "allocator backend parsed at load time");
-  } else {
-    TORCH_CHECK(false, "Error parsing backend value", "");
+#endif // CUDA_VERSION >= 11040
   }
-  return i;
 #endif // USE_ROCM
+  return i;
 }
 
 void CUDAAllocatorConfig::parseArgs(const std::string& env) {
-  // If empty, set the default values
   bool used_cudaMallocAsync = false;
   bool used_native_specific_option = false;
 
-  std::vector<std::string> config;
-  lexArgs(env, config);
-  for (size_t i = 0; i < config.size(); i++) {
-    std::string_view config_item_view(config[i]);
-    if (config_item_view == "backend") {
-      i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
+  c10::CachingAllocator::ConfigTokenizer tokenizer(env);
+  for (size_t i = 0; i < tokenizer.size(); i++) {
+    const auto& key = tokenizer[i];
+    if (key == "backend") {
+      i = parseAllocatorConfig(tokenizer, i, used_cudaMallocAsync);
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config_item_view == "release_lock_on_hipmalloc" ||
-        config_item_view ==
+        key == "release_lock_on_hipmalloc" ||
+        key ==
             "release_lock_on_c"
             "udamalloc") {
       used_native_specific_option = true;
-      consumeToken(config, ++i, ':');
-      ++i;
-      TORCH_CHECK(
-          i < config.size() &&
-              (std::string_view(config[i]) == "True" ||
-               std::string_view(config[i]) == "False"),
-          "Expected a single True/False argument for release_lock_on_cudamalloc");
-      config_item_view = config[i];
-      m_release_lock_on_cudamalloc = (config_item_view == "True");
+      tokenizer.checkToken(++i, ":");
+      m_release_lock_on_cudamalloc = tokenizer.toBool(++i);
     } else if (
         // ROCm build's hipify step will change "cuda" to "hip", but for ease of
         // use, accept both. We must break up the string to prevent hipify here.
-        config_item_view == "pinned_use_hip_host_register" ||
-        config_item_view ==
+        key == "pinned_use_hip_host_register" ||
+        key ==
             "pinned_use_c"
             "uda_host_register") {
-      i = parsePinnedUseCudaHostRegister(config, i);
+      i = parsePinnedUseCudaHostRegister(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "pinned_num_register_threads") {
-      i = parsePinnedNumRegisterThreads(config, i);
+    } else if (key == "pinned_num_register_threads") {
+      i = parsePinnedNumRegisterThreads(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "pinned_reserve_segment_size_mb") {
-      i = parsePinnedReserveSegmentSize(config, i);
+    } else if (key == "pinned_reserve_segment_size_mb") {
+      i = parsePinnedReserveSegmentSize(tokenizer, i);
       used_native_specific_option = true;
-    } else if (config_item_view == "graph_capture_record_stream_reuse") {
-      i = parseGraphCaptureRecordStreamReuse(config, i);
+    } else if (key == "graph_capture_record_stream_reuse") {
+      i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
       used_native_specific_option = true;
     } else {
       const auto& keys =
           c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
       TORCH_CHECK(
-          keys.find(config[i]) != keys.end(),
+          keys.find(key) != keys.end(),
           "Unrecognized key '",
-          config_item_view,
+          key,
           "' in CUDA allocator config.");
       // Skip the key and its value
-      consumeToken(config, ++i, ':');
-      i++; // Move to the value
-      if (config[i] == "[") {
-        // Skip config inside the list until matching ']'
-        // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
-        while (++i < config.size() && config[i] != "]") {
-        }
-      }
+      i = tokenizer.skipKey(i);
     }
 
-    if (i + 1 < config.size()) {
-      consumeToken(config, ++i, ',');
+    if (i + 1 < tokenizer.size()) {
+      tokenizer.checkToken(++i, ",");
     }
   }
 
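The net effect of the hunk above is that bounds checks, separator checks, and True/False validation move out of parseArgs and its helpers and into the tokenizer. A minimal standalone sketch of the contract the new call sites rely on follows; it is a hypothetical illustration, not the actual c10::CachingAllocator::ConfigTokenizer implementation, and the MiniTokenizer name is made up.

// Hypothetical sketch of the tokenizer contract assumed by the refactored code:
// out-of-range indices, unexpected separators, and non-boolean values all fail
// inside the helper rather than at every call site.
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

struct MiniTokenizer {
  std::vector<std::string> tokens; // e.g. {"key", ":", "True", ",", ...}

  const std::string& at(size_t i) const {
    if (i >= tokens.size()) {
      throw std::runtime_error("unexpected end of allocator config");
    }
    return tokens[i];
  }
  void checkToken(size_t i, const std::string& expected) const {
    if (at(i) != expected) {
      throw std::runtime_error("expected '" + expected + "', got '" + at(i) + "'");
    }
  }
  bool toBool(size_t i) const {
    const std::string& v = at(i);
    if (v == "True") {
      return true;
    }
    if (v == "False") {
      return false;
    }
    throw std::runtime_error("expected True/False, got '" + v + "'");
  }
};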
@@ -207,75 +131,48 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
 }
 
 size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        (config[i] == "True" || config[i] == "False"),
-        "Expected a single True/False argument for pinned_use_cuda_host_register");
-    m_pinned_use_cuda_host_register = (config[i] == "True");
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_use_cuda_host_register value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  m_pinned_use_cuda_host_register = tokenizer.toBool(++i);
   return i;
 }
 
 size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    TORCH_CHECK(
-        (config[i] == "True" || config[i] == "False"),
-        "Expected a single True/False argument for graph_capture_record_stream_reuse");
-    m_graph_capture_record_stream_reuse = (config[i] == "True");
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting graph_capture_record_stream_reuse value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  m_graph_capture_record_stream_reuse = tokenizer.toBool(++i);
 
   return i;
 }
 
 size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    size_t val2 = stoi(config[i]);
-    TORCH_CHECK(
-        llvm::isPowerOf2_64(val2),
-        "Number of register threads has to be power of 2 ",
-        "");
-    auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
-    TORCH_CHECK(
-        val2 <= maxThreads,
-        "Number of register threads should be less than or equal to " +
-            std::to_string(maxThreads),
-        "");
-    m_pinned_num_register_threads = val2;
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_num_register_threads value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  size_t val2 = tokenizer.toSizeT(++i);
+  TORCH_CHECK(
+      llvm::isPowerOf2_64(val2),
+      "Number of register threads has to be power of 2, got ",
+      val2);
+  auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
+  TORCH_CHECK(
+      val2 <= maxThreads,
+      "Number of register threads should be less than or equal to ",
+      maxThreads,
+      ", got ",
+      val2);
+  m_pinned_num_register_threads = val2;
   return i;
 }
 
 size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
-    const std::vector<std::string>& config,
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
     size_t i) {
-  consumeToken(config, ++i, ':');
-  if (++i < config.size()) {
-    size_t val2 = stoi(config[i]);
-    TORCH_CHECK(
-        val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
-    m_pinned_reserve_segment_size_mb = val2;
-  } else {
-    TORCH_CHECK(
-        false, "Error, expecting pinned_reserve_segment_size_mb value", "");
-  }
+  tokenizer.checkToken(++i, ":");
+  size_t val2 = tokenizer.toSizeT(++i);
+  TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
+  m_pinned_reserve_segment_size_mb = val2;
   return i;
 }
 
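parsePinnedNumRegisterThreads keeps its two invariants after the refactor: the thread count must be a power of two and must not exceed pinned_max_register_threads(); only the raw-string handling moved into tokenizer.toSizeT(). For illustration, here is a standalone check equivalent to the llvm::isPowerOf2_64 call used above (generic C++, not the LLVM helper itself):

// A power of two has exactly one bit set, so v & (v - 1) clears it to zero;
// zero itself is not a power of two. This mirrors llvm::isPowerOf2_64.
#include <cstdint>

constexpr bool is_power_of_two_64(std::uint64_t v) {
  return v != 0 && (v & (v - 1)) == 0;
}

static_assert(is_power_of_two_64(8), "8 register threads would be accepted");
static_assert(!is_power_of_two_64(6), "6 register threads would be rejected");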
c10/cuda/CUDAAllocatorConfig.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/core/AllocatorConfig.h>
+#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
@@ -144,37 +145,36 @@ class C10_CUDA_API CUDAAllocatorConfig {
   void parseArgs(const std::string& env);
 
  private:
-  CUDAAllocatorConfig();
+  CUDAAllocatorConfig() = default;
 
-  static void lexArgs(const std::string& env, std::vector<std::string>& config);
-  static void consumeToken(
-      const std::vector<std::string>& config,
-      size_t i,
-      const char c);
   size_t parseAllocatorConfig(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i,
       bool& used_cudaMallocAsync);
   size_t parsePinnedUseCudaHostRegister(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parsePinnedNumRegisterThreads(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parsePinnedReserveSegmentSize(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
   size_t parseGraphCaptureRecordStreamReuse(
-      const std::vector<std::string>& config,
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
       size_t i);
 
-  std::atomic<size_t> m_pinned_num_register_threads;
-  std::atomic<size_t> m_pinned_reserve_segment_size_mb;
-  std::atomic<Expandable_Segments_Handle_Type>
-      m_expandable_segments_handle_type;
-  std::atomic<bool> m_release_lock_on_cudamalloc;
-  std::atomic<bool> m_pinned_use_cuda_host_register;
-  std::atomic<bool> m_graph_capture_record_stream_reuse;
+  std::atomic<size_t> m_pinned_num_register_threads{1};
+  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
+  std::atomic<Expandable_Segments_Handle_Type> m_expandable_segments_handle_type
+#if CUDA_VERSION >= 12030
+      {Expandable_Segments_Handle_Type::UNSPECIFIED};
+#else
+      {Expandable_Segments_Handle_Type::POSIX_FD};
+#endif
+  std::atomic<bool> m_release_lock_on_cudamalloc{false};
+  std::atomic<bool> m_pinned_use_cuda_host_register{false};
+  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
 };
 
 // Keep this for backwards compatibility
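With the user-defined constructor removed, the defaults that the old member-initializer list supplied now live as in-class default member initializers, so CUDAAllocatorConfig() = default; yields the same initial state. A generic before/after illustration of that equivalence (hypothetical type and member names, not taken from the commit):

// Both structs initialize their members identically; the second form needs no
// user-written constructor, which is what the header change above relies on.
#include <atomic>
#include <cstddef>

struct BeforeStyle {
  BeforeStyle() : m_threads(1), m_use_host_register(false) {}
  std::atomic<std::size_t> m_threads;
  std::atomic<bool> m_use_host_register;
};

struct AfterStyle {
  std::atomic<std::size_t> m_threads{1};
  std::atomic<bool> m_use_host_register{false};
};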