Compare commits

1 commit

Commit a6df8acdbf ("init version"), 2025-10-10 10:26:40 -07:00

6 changed files with 246 additions and 3 deletions

View File

@@ -1271,6 +1271,9 @@ class DeviceCachingAllocator {
// thread local compile context for each device
static thread_local std::stack<std::string> compile_context;
// thread local user metadata for annotating allocations
static thread_local std::string user_metadata;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit DeviceCachingAllocator(c10::DeviceIndex id)
@@ -1313,6 +1316,14 @@ class DeviceCachingAllocator {
}
}
void setUserMetadata(const std::string& metadata) {
user_metadata = metadata;
}
std::string getUserMetadata() {
return user_metadata;
}
bool checkPoolLiveAllocations(
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) const {
@@ -3695,7 +3706,8 @@ class DeviceCachingAllocator {
mempool_id,
getApproximateTime(),
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
compile_string);
compile_string,
user_metadata);
// Callbacks should not include any Pytorch call
for (const auto& cb : trace_trackers_) {
@@ -3750,6 +3762,7 @@ static void uncached_delete(void* ptr) {
static void local_raw_delete(void* ptr);
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
thread_local std::string DeviceCachingAllocator::user_metadata;
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
@@ -3947,6 +3960,18 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->popCompileContext();
}
void setUserMetadata(const std::string& metadata) override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
device_allocator[device]->setUserMetadata(metadata);
}
std::string getUserMetadata() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return device_allocator[device]->getUserMetadata();
}
bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
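Because user_metadata is declared static thread_local above, the annotation is per-thread state: a newly spawned thread starts with an empty string rather than inheriting the value set elsewhere. A minimal sketch of the intended behavior, assuming the Python bindings added later in this diff:

import threading

import torch

def worker():
    # user_metadata is thread_local on the C++ side, so this thread starts
    # with empty metadata no matter what the main thread set.
    torch.cuda.memory._set_memory_metadata("worker_phase")
    y = torch.empty(1024, device="cuda")  # traced with "worker_phase"

torch.cuda.memory._record_memory_history(enabled="all")
torch.cuda.memory._set_memory_metadata("main_phase")
x = torch.empty(1024, device="cuda")  # traced with "main_phase"

t = threading.Thread(target=worker)
t.start()
t.join()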

View File

@@ -118,7 +118,8 @@ struct TraceEntry {
MempoolId_t mempool,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr,
std::string compile_context = "")
std::string compile_context = "",
std::string user_metadata = "")
: action_(action),
device_(device),
addr_(addr),
@@ -126,7 +127,8 @@ struct TraceEntry {
stream_(stream),
size_(size),
mempool_(std::move(mempool)),
compile_context_(std::move(compile_context)) {
compile_context_(std::move(compile_context)),
user_metadata_(std::move(user_metadata)) {
time_.approx_t_ = time;
}
Action action_;
@@ -138,6 +140,7 @@ struct TraceEntry {
MempoolId_t mempool_;
trace_time_ time_{};
std::string compile_context_;
std::string user_metadata_;
};
// Calls made by record_function will save annotations
@@ -297,6 +300,8 @@ class CUDAAllocator : public DeviceAllocator {
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
virtual void pushCompileContext(std::string& md) {}
virtual void popCompileContext() {}
virtual void setUserMetadata(const std::string& metadata) {}
virtual std::string getUserMetadata() { return ""; }
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
@@ -536,6 +541,14 @@ inline void enablePeerAccess(
get()->enablePeerAccess(dev, dev_to_access);
}
inline void setUserMetadata(const std::string& metadata) {
get()->setUserMetadata(metadata);
}
inline std::string getUserMetadata() {
return get()->getUserMetadata();
}
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {

test_user_metadata.py (new file, 163 lines)
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Test script for custom user metadata in CUDA memory allocations.
This script demonstrates how to:
1. Set custom metadata for memory allocations
2. Allocate tensors with different metadata tags
3. Take a memory snapshot
4. Display the metadata in the snapshot
"""
import torch
import pickle
def test_memory_metadata():
"""Test the custom memory metadata feature."""
# Check if CUDA is available
if not torch.cuda.is_available():
print("CUDA is not available. Skipping test.")
return
print("Starting memory metadata test...")
print()
# Enable memory history recording
print("1. Enabling memory history recording...")
torch.cuda.memory._record_memory_history(enabled="all")
# Test 1: Set metadata and allocate tensor
print("2. Setting metadata to 'training_phase'...")
torch.cuda.memory._set_memory_metadata("training_phase")
# Verify the metadata was set
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
assert current_metadata == "training_phase", "Metadata not set correctly!"
# Allocate a tensor
print("3. Allocating tensor x (100x100)...")
x = torch.randn(100, 100, device='cuda')
# Test 2: Change metadata and allocate another tensor
print("4. Setting metadata to 'validation_phase'...")
torch.cuda.memory._set_memory_metadata("validation_phase")
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
print("5. Allocating tensor y (200x200)...")
y = torch.randn(200, 200, device='cuda')
# Test 3: Clear metadata and allocate another tensor
print("6. Clearing metadata (setting to empty string)...")
torch.cuda.memory._set_memory_metadata("")
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
assert current_metadata == "", "Metadata not cleared!"
print("7. Allocating tensor z (50x50) with no metadata...")
z = torch.randn(50, 50, device='cuda')
# Test 4: Take a snapshot
print("8. Taking memory snapshot...")
snapshot = torch.cuda.memory._snapshot()
# Analyze the snapshot
print()
print("=" * 70)
print("SNAPSHOT ANALYSIS")
print("=" * 70)
# Look at device traces
if 'device_traces' in snapshot:
device_traces = snapshot['device_traces']
print(f"Number of devices: {len(device_traces)}")
if len(device_traces) > 0:
traces = device_traces[0] # First device
print(f"Number of trace entries: {len(traces)}")
print()
# Find allocation entries with metadata
alloc_count = 0
metadata_found = {}
for i, trace in enumerate(traces):
if trace.get('action') == 'alloc':
alloc_count += 1
user_metadata = trace.get('user_metadata', '')
size = trace.get('size', 0)
if user_metadata:
if user_metadata not in metadata_found:
metadata_found[user_metadata] = []
metadata_found[user_metadata].append({
'index': i,
'size': size,
                            'addr': trace.get('addr', 0)  # default to 0, not 'N/A', so the 0x{...:x} format below cannot raise
})
print(f"Total allocations found: {alloc_count}")
print()
# Display allocations grouped by metadata
if metadata_found:
print("Allocations with metadata:")
print("-" * 70)
for metadata, allocs in metadata_found.items():
print(f"\nMetadata: '{metadata}'")
for alloc in allocs:
print(f" - Trace #{alloc['index']}: "
f"size={alloc['size']:,} bytes, "
f"addr=0x{alloc['addr']:x}")
else:
print("WARNING: No allocations with metadata found!")
print("This might indicate the feature is not working correctly.")
# Show some example traces with metadata
print()
print("Sample trace entries with user_metadata field:")
print("-" * 70)
shown = 0
for i, trace in enumerate(traces):
if 'user_metadata' in trace and trace.get('action') == 'alloc':
print(f"\nTrace #{i}:")
print(f" action: {trace.get('action')}")
print(f" size: {trace.get('size'):,} bytes")
print(f" user_metadata: '{trace.get('user_metadata')}'")
print(f" compile_context: '{trace.get('compile_context')}'")
shown += 1
if shown >= 5: # Show first 5
break
print()
print("=" * 70)
# Save snapshot for inspection
snapshot_file = "/tmp/memory_snapshot_with_metadata.pickle"
print(f"9. Saving snapshot to {snapshot_file}...")
torch.cuda.memory._dump_snapshot(snapshot_file)
    print("   Snapshot saved! You can inspect it with:")
print(f" python -c \"import pickle; s=pickle.load(open('{snapshot_file}','rb')); print(s)\"")
# Cleanup
print()
print("10. Cleaning up...")
del x, y, z
torch.cuda.empty_cache()
print()
print("✓ Test completed successfully!")
print()
print("Summary:")
print(" - Successfully set and retrieved custom metadata")
print(" - Metadata was recorded in memory allocations")
print(" - Snapshot contains user_metadata field in trace entries")
if __name__ == "__main__":
test_memory_metadata()

View File

@@ -764,6 +764,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) {
py::str frames_s = "frames";
py::str time_us_s = "time_us";
py::str compile_context_s = "compile_context";
py::str user_metadata_s = "user_metadata";
py::list empty_frames;
std::vector<CapturedTraceback*> to_gather_frames;
@@ -881,6 +882,7 @@
trace_entry[stream_s] = int64_t(te.stream_);
trace_entry[time_us_s] = te.time_.t_;
trace_entry[compile_context_s] = te.compile_context_;
trace_entry[user_metadata_s] = te.user_metadata_;
trace.append(trace_entry);
}
traces.append(trace);
@@ -1136,6 +1138,14 @@ static void registerCudaDeviceProperties(PyObject* module) {
return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
});
m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) {
c10::cuda::CUDACachingAllocator::setUserMetadata(metadata);
});
m.def("_cuda_getMemoryMetadata", []() {
return c10::cuda::CUDACachingAllocator::getUserMetadata();
});
m.def("_cuda_get_conv_benchmark_empty_cache", []() {
return at::native::_cudnn_get_conv_benchmark_empty_cache();
});

View File

@@ -310,6 +310,7 @@ std::string _memory_snapshot_pickled() {
IValue is_expandable_s = "is_expandable";
IValue time_us_s = "time_us";
IValue compile_contexts_s = "compile_context";
IValue user_metadata_s = "user_metadata";
auto empty_frames = new_list();
@@ -427,6 +428,7 @@
trace_entry.insert(size_s, (int64_t)te.size_);
trace_entry.insert(stream_s, int64_t(te.stream_));
trace_entry.insert(compile_contexts_s, te.compile_context_);
trace_entry.insert(user_metadata_s, te.user_metadata_);
if (te.context_) {
auto sc = getFromContext(te.context_);
frame_tracebacks.push_back(sc);
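Since user_metadata is now serialized on both snapshot paths (the Python dict built in Module.cpp above and the pickled form here), a dump written by torch.cuda.memory._dump_snapshot can be aggregated offline. A short sketch, reusing the file name from the test script above:

import pickle
from collections import defaultdict

with open("/tmp/memory_snapshot_with_metadata.pickle", "rb") as f:
    snapshot = pickle.load(f)

# Sum allocated bytes per user_metadata tag across all devices.
bytes_by_tag = defaultdict(int)
for device_trace in snapshot["device_traces"]:
    for entry in device_trace:
        if entry.get("action") == "alloc":
            bytes_by_tag[entry.get("user_metadata", "")] += entry.get("size", 0)

for tag, total in sorted(bytes_by_tag.items()):
    print(f"{tag or '<untagged>'}: {total:,} bytes")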

View File

@@ -1063,6 +1063,36 @@ def _dump_snapshot(filename="dump_snapshot.pickle"):
pickle.dump(s, f)
def _set_memory_metadata(metadata: str):
    """
    Set custom metadata that will be attached to all subsequent CUDA memory
    allocations made by the calling thread.

    The metadata is recorded in the memory snapshot for every allocation made
    after this call until it is cleared or changed. Because the underlying
    storage is thread-local, each thread annotates only its own allocations.

    Args:
        metadata (str): Custom metadata string to attach to allocations.
            Pass an empty string to clear the metadata.

    Example:
        >>> torch.cuda.memory._set_memory_metadata("training_phase")
        >>> # All allocations here will have "training_phase" metadata
        >>> x = torch.randn(100, 100, device='cuda')
        >>> torch.cuda.memory._set_memory_metadata("")  # Clear metadata
    """
torch._C._cuda_setMemoryMetadata(metadata)
def _get_memory_metadata() -> str:
    """
    Get the custom metadata currently being attached to CUDA memory
    allocations made by the calling thread.

    Returns:
        str: The current metadata string, or an empty string if none is set.
    """
return torch._C._cuda_getMemoryMetadata()
def _save_segment_usage(filename="output.svg", snapshot=None):
if snapshot is None:
snapshot = _snapshot()
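Because the new API is plain set/get state, a scoped annotation helper is easy to layer on top of it. A hypothetical convenience wrapper, not part of this change, that restores the previous tag on exit:

import contextlib

import torch

@contextlib.contextmanager
def cuda_memory_tag(tag: str):
    # Hypothetical helper built on the two functions above: attach tag to
    # allocations made inside the block, then restore the prior metadata.
    previous = torch.cuda.memory._get_memory_metadata()
    torch.cuda.memory._set_memory_metadata(tag)
    try:
        yield
    finally:
        torch.cuda.memory._set_memory_metadata(previous)

with cuda_memory_tag("training_phase"):
    x = torch.randn(100, 100, device="cuda")  # traced with "training_phase"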