Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
[Memory Snapshot] Add CUDAAllocatorConfig details into snapshot metadata (#119404)
Summary: Include the CUDAAllocatorConfig at the time of the snapshot in the snapshot file. This adds the following fields to the snapshot metadata:

```
double garbage_collection_threshold;
size_t max_split_size;
size_t pinned_num_register_threads;
bool expandable_segments;
bool release_lock_on_cudamalloc;
bool pinned_use_cuda_host_register;
std::string last_allocator_settings;
std::vector<size_t> roundup_power2_divisions;
```

Test Plan:

`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` produces

```
{'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True', 'max_split_size': -1, 'garbage_collection_threshold': 0.0, 'expandable_segments': True, 'pinned_num_register_threads': 1, 'release_lock_on_cudamalloc': False, 'pinned_use_cuda_host_register': False, 'roundup_power2_divisions': {'1': 0, '2': 0, '4': 0, '8': 0, '16': 0, '32': 0, '64': 0, '128': 0, '256': 0, '512': 0, '1024': 0, '2048': 0, '4096': 0, '8192': 0, '16384': 0, '32768': 0}}
```

`PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:2000,roundup_power2_divisions:[256:1,512:2,1024:4,>:8]"` produces

```
{'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:2000,roundup_power2_divisions:[256:1,512:2,1024:4,>:8]', 'max_split_size': 2097152000, 'garbage_collection_threshold': 0.0, 'expandable_segments': False, 'pinned_num_register_threads': 1, 'release_lock_on_cudamalloc': False, 'pinned_use_cuda_host_register': False, 'roundup_power2_divisions': {'1': 1, '2': 1, '4': 1, '8': 1, '16': 1, '32': 1, '64': 1, '128': 1, '256': 1, '512': 2, '1024': 8, '2048': 8, '4096': 8, '8192': 8, '16384': 8, '32768': 8}}
```

Differential Revision: D53536199

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119404
Approved by: https://github.com/zdevito
Committed by: PyTorch MergeBot
Parent: 9aa8bbf7f2
Commit: 7973ac586d
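As a usage sketch (not part of this change), the snippet below shows how the new metadata can be read back from Python on a CUDA-enabled build that contains this commit. It relies on the private `torch.cuda.memory._record_memory_history()` / `_snapshot()` / `_dump_snapshot()` helpers, and on the `allocator_settings` key added by the bindings in this diff.

```python
# Sketch: inspect the allocator config recorded in a CUDA memory snapshot.
# Assumes a CUDA-enabled PyTorch build that includes this commit.
import os

# The allocator config is read when the caching allocator initializes,
# so set it before importing torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

torch.cuda.memory._record_memory_history()   # start recording allocation events
x = torch.randn(1024, 1024, device="cuda")   # trigger a few allocations

snap = torch.cuda.memory._snapshot()          # dict with 'segments' and 'device_traces'
print(snap["allocator_settings"])             # new key carrying the CUDAAllocatorConfig

torch.cuda.memory._dump_snapshot("snapshot.pickle")  # pickled file carries the same metadata
```

The dictionaries in the Test Plan above are exactly what this `allocator_settings` entry looks like for the two example configurations.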
@@ -16,7 +16,8 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
       m_pinned_num_register_threads(1),
       m_expandable_segments(false),
       m_release_lock_on_cudamalloc(false),
-      m_pinned_use_cuda_host_register(false) {
+      m_pinned_use_cuda_host_register(false),
+      m_last_allocator_settings("") {
   m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
 }

@@ -243,6 +244,10 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
   if (env == nullptr) {
     return;
   }
+  {
+    std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
+    m_last_allocator_settings = env;
+  }

   std::vector<std::string> config;
   lexArgs(env, config);

@@ -7,6 +7,7 @@
 #include <atomic>
 #include <cstddef>
 #include <cstdlib>
+#include <mutex>
 #include <string>

 namespace c10::cuda::CUDACachingAllocator {

@@ -58,6 +59,16 @@ class C10_CUDA_API CUDAAllocatorConfig {
   // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
   static size_t roundup_power2_divisions(size_t size);

+  static std::vector<size_t> roundup_power2_divisions() {
+    return instance().m_roundup_power2_divisions;
+  }
+
+  static std::string last_allocator_settings() {
+    std::lock_guard<std::mutex> lock(
+        instance().m_last_allocator_settings_mutex);
+    return instance().m_last_allocator_settings;
+  }
+
   static CUDAAllocatorConfig& instance() {
     static CUDAAllocatorConfig* s_instance = ([]() {
       auto inst = new CUDAAllocatorConfig();

@@ -103,6 +114,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::atomic<bool> m_expandable_segments;
   std::atomic<bool> m_release_lock_on_cudamalloc;
   std::atomic<bool> m_pinned_use_cuda_host_register;
+  std::string m_last_allocator_settings;
+  std::mutex m_last_allocator_settings_mutex;
 };

 // General caching allocator utilities

@@ -3032,6 +3032,22 @@ class NativeCachingAllocator : public CUDAAllocator {
       auto snap = da->snapshot();
       result.segments.insert(result.segments.end(), snap.begin(), snap.end());
     }
+
+    auto& md = result.config_metadata;
+    md.garbage_collection_threshold =
+        CUDAAllocatorConfig::garbage_collection_threshold();
+    md.max_split_size = CUDAAllocatorConfig::max_split_size();
+    md.pinned_num_register_threads =
+        CUDAAllocatorConfig::pinned_num_register_threads();
+    md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
+    md.release_lock_on_malloc =
+        CUDAAllocatorConfig::release_lock_on_cudamalloc();
+    md.pinned_use_host_register =
+        CUDAAllocatorConfig::pinned_use_cuda_host_register();
+    md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
+    md.roundup_power2_divisions =
+        CUDAAllocatorConfig::roundup_power2_divisions();
+
     return result;
   }

@@ -187,9 +187,21 @@ struct TraceEntry {
   trace_time_ time_{};
 };

+struct AllocatorConfigInfo {
+  double garbage_collection_threshold;
+  size_t max_split_size;
+  size_t pinned_num_register_threads;
+  bool expandable_segments;
+  bool release_lock_on_malloc;
+  bool pinned_use_host_register;
+  std::string last_allocator_settings;
+  std::vector<size_t> roundup_power2_divisions;
+};
+
 struct SnapshotInfo {
   std::vector<SegmentInfo> segments;
   std::vector<std::vector<TraceEntry>> device_traces;
+  AllocatorConfigInfo config_metadata;
 };

 // returns the pointers freed in the pool

@@ -782,9 +782,43 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
     traces.append(trace);
   }

+  py::dict allocator_settings;
+  py::str last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
+  py::str max_split_size_s = "max_split_size";
+  py::str garbage_collection_threshold_s = "garbage_collection_threshold";
+  py::str expandable_segments_s = "expandable_segments";
+  py::str pinned_num_register_threads_s = "pinned_num_register_threads";
+  py::str release_lock_on_malloc_s = "release_lock_on_cudamalloc";
+  py::str pinned_use_host_register_s = "pinned_use_cuda_host_register";
+  py::str roundup_power2_divisions_s = "roundup_power2_divisions";
+
+  allocator_settings[last_allocator_settings_s] =
+      snapshot.config_metadata.last_allocator_settings;
+  allocator_settings[max_split_size_s] =
+      int64_t(snapshot.config_metadata.max_split_size);
+  allocator_settings[garbage_collection_threshold_s] =
+      snapshot.config_metadata.garbage_collection_threshold;
+  allocator_settings[expandable_segments_s] =
+      snapshot.config_metadata.expandable_segments;
+  allocator_settings[pinned_num_register_threads_s] =
+      int64_t(snapshot.config_metadata.pinned_num_register_threads);
+  allocator_settings[release_lock_on_malloc_s] =
+      snapshot.config_metadata.release_lock_on_malloc;
+  allocator_settings[pinned_use_host_register_s] =
+      snapshot.config_metadata.pinned_use_host_register;
+  unsigned int roundup_key = 1;
+  py::dict roundup_settings;
+  for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
+    py::str roundup_key_s = std::to_string(roundup_key);
+    roundup_settings[roundup_key_s] = int64_t(v);
+    roundup_key *= 2;
+  }
+  allocator_settings[roundup_power2_divisions_s] = roundup_settings;
+
   py::dict result;
   result["segments"] = segments;
   result["device_traces"] = traces;
+  result["allocator_settings"] = allocator_settings;

   auto frames = py_symbolize(to_gather_frames);
   for (auto i : c10::irange(frames.size())) {

@@ -317,9 +317,48 @@ std::string _memory_snapshot_pickled() {
     traces.push_back(trace);
   }

+  auto allocator_settings = new_dict();
+  IValue last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
+  IValue max_split_size_s = "max_split_size";
+  IValue garbage_collection_threshold_s = "garbage_collection_threshold";
+  IValue expandable_segments_s = "expandable_segments";
+  IValue pinned_num_register_threads_s = "pinned_num_register_threads";
+  IValue release_lock_on_malloc_s = "release_lock_on_cudamalloc";
+  IValue pinned_use_host_register_s = "pinned_use_cuda_host_register";
+  IValue roundup_power2_divisions_s = "roundup_power2_divisions";
+
+  allocator_settings.insert(
+      last_allocator_settings_s,
+      snapshot.config_metadata.last_allocator_settings);
+  allocator_settings.insert(
+      max_split_size_s, int64_t(snapshot.config_metadata.max_split_size));
+  allocator_settings.insert(
+      garbage_collection_threshold_s,
+      snapshot.config_metadata.garbage_collection_threshold);
+  allocator_settings.insert(
+      expandable_segments_s, snapshot.config_metadata.expandable_segments);
+  allocator_settings.insert(
+      pinned_num_register_threads_s,
+      int64_t(snapshot.config_metadata.pinned_num_register_threads));
+  allocator_settings.insert(
+      release_lock_on_malloc_s,
+      snapshot.config_metadata.release_lock_on_malloc);
+  allocator_settings.insert(
+      pinned_use_host_register_s,
+      snapshot.config_metadata.pinned_use_host_register);
+  unsigned int roundup_key = 1;
+  auto roundup_settings = new_dict();
+  for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
+    IValue roundup_key_s = std::to_string(roundup_key);
+    roundup_settings.insert(roundup_key_s, int64_t(v));
+    roundup_key *= 2;
+  }
+  allocator_settings.insert(roundup_power2_divisions_s, roundup_settings);
+
   auto result = new_dict();
   result.insert("segments", segments);
   result.insert("device_traces", traces);
+  result.insert("allocator_settings", allocator_settings);

   auto frames = ivalue_symbolize(frame_tracebacks);
   for (auto i : c10::irange(frames.size())) {
