[Memory Snapshot] Add CUDAAllocatorConfig details into snapshot metadata (#119404)

Summary:
Include the CUDAAllocatorConfig state at the time of the snapshot in the snapshot file. This includes adding the following variables:

```
  double garbage_collection_threshold;
  size_t max_split_size;
  size_t pinned_num_register_threads;
  bool expandable_segments;
  bool release_lock_on_cudamalloc;
  bool pinned_use_cuda_host_register;
  std::string last_allocator_settings;
  std::vector<size_t> roundup_power2_divisions;
```

Test Plan:
`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True ` produces
```
{'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
 'max_split_size': -1,
 'garbage_collection_threshold': 0.0,
 'expandable_segments': True,
 'pinned_num_register_threads': 1,
 'release_lock_on_cudamalloc': False,
 'pinned_use_cuda_host_register': False,
 'roundup_power2_divisions': {'1': 0,
  '2': 0,
  '4': 0,
  '8': 0,
  '16': 0,
  '32': 0,
  '64': 0,
  '128': 0,
  '256': 0,
  '512': 0,
  '1024': 0,
  '2048': 0,
  '4096': 0,
  '8192': 0,
  '16384': 0,
  '32768': 0}}
```
`PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:2000,roundup_power2_divisions:[256:1,512:2,1024:4,>:8]"` produces
```
{'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:2000,roundup_power2_divisions:[256:1,512:2,1024:4,>:8]',
 'max_split_size': 2097152000,
 'garbage_collection_threshold': 0.0,
 'expandable_segments': False,
 'pinned_num_register_threads': 1,
 'release_lock_on_cudamalloc': False,
 'pinned_use_cuda_host_register': False,
 'roundup_power2_divisions': {'1': 1, '2': 1, '4': 1, '8': 1, '16': 1, '32': 1, '64': 1, '128': 1, '256': 1, '512': 2, '1024': 8, '2048': 8, '4096': 8, '8192': 8, '16384': 8, '32768': 8}
}
```

Differential Revision: D53536199

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119404
Approved by: https://github.com/zdevito
This commit is contained in:
Aaron Enye Shi
2024-02-17 01:16:33 +00:00
committed by PyTorch MergeBot
parent 9aa8bbf7f2
commit 7973ac586d
6 changed files with 120 additions and 1 deletions

View File

@ -16,7 +16,8 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
m_pinned_num_register_threads(1),
m_expandable_segments(false),
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false) {
m_pinned_use_cuda_host_register(false),
m_last_allocator_settings("") {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}
@ -243,6 +244,10 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
if (env == nullptr) {
return;
}
{
std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
m_last_allocator_settings = env;
}
std::vector<std::string> config;
lexArgs(env, config);

View File

@ -7,6 +7,7 @@
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
namespace c10::cuda::CUDACachingAllocator {
@ -58,6 +59,16 @@ class C10_CUDA_API CUDAAllocatorConfig {
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
// Returns a copy of the full table of roundup divisions, one entry per
// power-of-two size interval (the size_t overload above returns the
// division for a single allocation size instead).
static std::vector<size_t> roundup_power2_divisions() {
return instance().m_roundup_power2_divisions;
}
// Returns the raw PYTORCH_CUDA_ALLOC_CONF string most recently passed to
// parseArgs(). Guarded by a mutex because std::string, unlike the other
// std::atomic config fields, cannot be read and written atomically.
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
@ -103,6 +114,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
std::atomic<bool> m_expandable_segments;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// General caching allocator utilities

View File

@ -3032,6 +3032,22 @@ class NativeCachingAllocator : public CUDAAllocator {
auto snap = da->snapshot();
result.segments.insert(result.segments.end(), snap.begin(), snap.end());
}
auto& md = result.config_metadata;
md.garbage_collection_threshold =
CUDAAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size();
md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
md.release_lock_on_malloc =
CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.roundup_power2_divisions =
CUDAAllocatorConfig::roundup_power2_divisions();
return result;
}

View File

@ -187,9 +187,21 @@ struct TraceEntry {
trace_time_ time_{};
};
// Copy of the CUDAAllocatorConfig values captured at the moment a memory
// snapshot is taken, so the snapshot file records the allocator settings
// that were in effect.
struct AllocatorConfigInfo {
double garbage_collection_threshold;
size_t max_split_size;
size_t pinned_num_register_threads;
bool expandable_segments;
bool release_lock_on_malloc;
bool pinned_use_host_register;
// Raw PYTORCH_CUDA_ALLOC_CONF string last parsed by the allocator.
std::string last_allocator_settings;
// One division value per power-of-two size interval.
std::vector<size_t> roundup_power2_divisions;
};
// Complete memory snapshot: per-segment state, per-device allocation trace
// history, and the allocator configuration active when the snapshot was
// taken.
struct SnapshotInfo {
std::vector<SegmentInfo> segments;
std::vector<std::vector<TraceEntry>> device_traces;
AllocatorConfigInfo config_metadata;
};
// returns the pointers freed in the pool

View File

@ -782,9 +782,43 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
traces.append(trace);
}
py::dict allocator_settings;
py::str last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
py::str max_split_size_s = "max_split_size";
py::str garbage_collection_threshold_s = "garbage_collection_threshold";
py::str expandable_segments_s = "expandable_segments";
py::str pinned_num_register_threads_s = "pinned_num_register_threads";
py::str release_lock_on_malloc_s = "release_lock_on_cudamalloc";
py::str pinned_use_host_register_s = "pinned_use_cuda_host_register";
py::str roundup_power2_divisions_s = "roundup_power2_divisions";
allocator_settings[last_allocator_settings_s] =
snapshot.config_metadata.last_allocator_settings;
allocator_settings[max_split_size_s] =
int64_t(snapshot.config_metadata.max_split_size);
allocator_settings[garbage_collection_threshold_s] =
snapshot.config_metadata.garbage_collection_threshold;
allocator_settings[expandable_segments_s] =
snapshot.config_metadata.expandable_segments;
allocator_settings[pinned_num_register_threads_s] =
int64_t(snapshot.config_metadata.pinned_num_register_threads);
allocator_settings[release_lock_on_malloc_s] =
snapshot.config_metadata.release_lock_on_malloc;
allocator_settings[pinned_use_host_register_s] =
snapshot.config_metadata.pinned_use_host_register;
unsigned int roundup_key = 1;
py::dict roundup_settings;
for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
py::str roundup_key_s = std::to_string(roundup_key);
roundup_settings[roundup_key_s] = int64_t(v);
roundup_key *= 2;
}
allocator_settings[roundup_power2_divisions_s] = roundup_settings;
py::dict result;
result["segments"] = segments;
result["device_traces"] = traces;
result["allocator_settings"] = allocator_settings;
auto frames = py_symbolize(to_gather_frames);
for (auto i : c10::irange(frames.size())) {

View File

@ -317,9 +317,48 @@ std::string _memory_snapshot_pickled() {
traces.push_back(trace);
}
auto allocator_settings = new_dict();
IValue last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
IValue max_split_size_s = "max_split_size";
IValue garbage_collection_threshold_s = "garbage_collection_threshold";
IValue expandable_segments_s = "expandable_segments";
IValue pinned_num_register_threads_s = "pinned_num_register_threads";
IValue release_lock_on_malloc_s = "release_lock_on_cudamalloc";
IValue pinned_use_host_register_s = "pinned_use_cuda_host_register";
IValue roundup_power2_divisions_s = "roundup_power2_divisions";
allocator_settings.insert(
last_allocator_settings_s,
snapshot.config_metadata.last_allocator_settings);
allocator_settings.insert(
max_split_size_s, int64_t(snapshot.config_metadata.max_split_size));
allocator_settings.insert(
garbage_collection_threshold_s,
snapshot.config_metadata.garbage_collection_threshold);
allocator_settings.insert(
expandable_segments_s, snapshot.config_metadata.expandable_segments);
allocator_settings.insert(
pinned_num_register_threads_s,
int64_t(snapshot.config_metadata.pinned_num_register_threads));
allocator_settings.insert(
release_lock_on_malloc_s,
snapshot.config_metadata.release_lock_on_malloc);
allocator_settings.insert(
pinned_use_host_register_s,
snapshot.config_metadata.pinned_use_host_register);
unsigned int roundup_key = 1;
auto roundup_settings = new_dict();
for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
IValue roundup_key_s = std::to_string(roundup_key);
roundup_settings.insert(roundup_key_s, int64_t(v));
roundup_key *= 2;
}
allocator_settings.insert(roundup_power2_divisions_s, roundup_settings);
auto result = new_dict();
result.insert("segments", segments);
result.insert("device_traces", traces);
result.insert("allocator_settings", allocator_settings);
auto frames = ivalue_symbolize(frame_tracebacks);
for (auto i : c10::irange(frames.size())) {