Compare commits

1 commit

Commit a6df8acdbf ("init version"), 2025-10-10 10:26:40 -07:00

6 changed files with 246 additions and 3 deletions

View File

@@ -1271,6 +1271,9 @@ class DeviceCachingAllocator {
// thread local compile context for each device
static thread_local std::stack<std::string> compile_context;
// thread local user metadata for annotating allocations
static thread_local std::string user_metadata;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit DeviceCachingAllocator(c10::DeviceIndex id)
@@ -1313,6 +1316,14 @@ class DeviceCachingAllocator {
}
}
void setUserMetadata(const std::string& metadata) {
user_metadata = metadata;
}
std::string getUserMetadata() {
return user_metadata;
}
bool checkPoolLiveAllocations(
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) const {
@@ -3695,7 +3706,8 @@ class DeviceCachingAllocator {
mempool_id,
getApproximateTime(),
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
compile_string);
compile_string,
user_metadata);
// Callbacks should not include any Pytorch call
for (const auto& cb : trace_trackers_) {
@@ -3750,6 +3762,7 @@ static void uncached_delete(void* ptr) {
static void local_raw_delete(void* ptr);
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
thread_local std::string DeviceCachingAllocator::user_metadata;
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
@@ -3947,6 +3960,18 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->popCompileContext();
}
void setUserMetadata(const std::string& metadata) override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
device_allocator[device]->setUserMetadata(metadata);
}
std::string getUserMetadata() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return device_allocator[device]->getUserMetadata();
}
bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
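Because user_metadata is declared static thread_local above, the annotation is per-thread state: a newly spawned thread starts with an empty string rather than inheriting the value set elsewhere. A minimal sketch of the intended behavior, assuming the Python bindings added later in this diff:

import threading

import torch

def worker():
    # user_metadata is thread_local on the C++ side, so this thread starts
    # with empty metadata no matter what the main thread set.
    torch.cuda.memory._set_memory_metadata("worker_phase")
    y = torch.empty(1024, device="cuda")  # traced with "worker_phase"

torch.cuda.memory._record_memory_history(enabled="all")
torch.cuda.memory._set_memory_metadata("main_phase")
x = torch.empty(1024, device="cuda")  # traced with "main_phase"

t = threading.Thread(target=worker)
t.start()
t.join()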

View File

@@ -118,7 +118,8 @@ struct TraceEntry {
MempoolId_t mempool,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr,
std::string compile_context = "")
std::string compile_context = "",
std::string user_metadata = "")
: action_(action),
device_(device),
addr_(addr),
@@ -126,7 +127,8 @@ struct TraceEntry {
stream_(stream),
size_(size),
mempool_(std::move(mempool)),
compile_context_(std::move(compile_context)) {
compile_context_(std::move(compile_context)),
user_metadata_(std::move(user_metadata)) {
time_.approx_t_ = time;
}
Action action_;
@@ -138,6 +140,7 @@ struct TraceEntry {
MempoolId_t mempool_;
trace_time_ time_{};
std::string compile_context_;
std::string user_metadata_;
};
// Calls made by record_function will save annotations
@@ -297,6 +300,8 @@ class CUDAAllocator : public DeviceAllocator {
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
virtual void pushCompileContext(std::string& md) {}
virtual void popCompileContext() {}
virtual void setUserMetadata(const std::string& metadata) {}
virtual std::string getUserMetadata() { return ""; }
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
@@ -536,6 +541,14 @@ inline void enablePeerAccess(
get()->enablePeerAccess(dev, dev_to_access);
}
inline void setUserMetadata(const std::string& metadata) {
get()->setUserMetadata(metadata);
}
inline std::string getUserMetadata() {
return get()->getUserMetadata();
}
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {

test_user_metadata.py (new file, 163 lines)
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Test script for custom user metadata in CUDA memory allocations.
This script demonstrates how to:
1. Set custom metadata for memory allocations
2. Allocate tensors with different metadata tags
3. Take a memory snapshot
4. Display the metadata in the snapshot
"""
import torch
import pickle
def test_memory_metadata():
"""Test the custom memory metadata feature."""
# Check if CUDA is available
if not torch.cuda.is_available():
print("CUDA is not available. Skipping test.")
return
print("Starting memory metadata test...")
print()
# Enable memory history recording
print("1. Enabling memory history recording...")
torch.cuda.memory._record_memory_history(enabled="all")
# Test 1: Set metadata and allocate tensor
print("2. Setting metadata to 'training_phase'...")
torch.cuda.memory._set_memory_metadata("training_phase")
# Verify the metadata was set
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
assert current_metadata == "training_phase", "Metadata not set correctly!"
# Allocate a tensor
print("3. Allocating tensor x (100x100)...")
x = torch.randn(100, 100, device='cuda')
# Test 2: Change metadata and allocate another tensor
print("4. Setting metadata to 'validation_phase'...")
torch.cuda.memory._set_memory_metadata("validation_phase")
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
print("5. Allocating tensor y (200x200)...")
y = torch.randn(200, 200, device='cuda')
# Test 3: Clear metadata and allocate another tensor
print("6. Clearing metadata (setting to empty string)...")
torch.cuda.memory._set_memory_metadata("")
current_metadata = torch.cuda.memory._get_memory_metadata()
print(f" Current metadata: '{current_metadata}'")
assert current_metadata == "", "Metadata not cleared!"
print("7. Allocating tensor z (50x50) with no metadata...")
z = torch.randn(50, 50, device='cuda')
# Test 4: Take a snapshot
print("8. Taking memory snapshot...")
snapshot = torch.cuda.memory._snapshot()
# Analyze the snapshot
print()
print("=" * 70)
print("SNAPSHOT ANALYSIS")
print("=" * 70)
# Look at device traces
if 'device_traces' in snapshot:
device_traces = snapshot['device_traces']
print(f"Number of devices: {len(device_traces)}")
if len(device_traces) > 0:
traces = device_traces[0] # First device
print(f"Number of trace entries: {len(traces)}")
print()
# Find allocation entries with metadata
alloc_count = 0
metadata_found = {}
for i, trace in enumerate(traces):
if trace.get('action') == 'alloc':
alloc_count += 1
user_metadata = trace.get('user_metadata', '')
size = trace.get('size', 0)
if user_metadata:
if user_metadata not in metadata_found:
metadata_found[user_metadata] = []
metadata_found[user_metadata].append({
'index': i,
'size': size,
                            'addr': trace.get('addr', 0)  # default to 0, not 'N/A', so the 0x{...:x} format below cannot raise
})
print(f"Total allocations found: {alloc_count}")
print()
# Display allocations grouped by metadata
if metadata_found:
print("Allocations with metadata:")
print("-" * 70)
for metadata, allocs in metadata_found.items():
print(f"\nMetadata: '{metadata}'")
for alloc in allocs:
print(f" - Trace #{alloc['index']}: "
f"size={alloc['size']:,} bytes, "
f"addr=0x{alloc['addr']:x}")
else:
print("WARNING: No allocations with metadata found!")
print("This might indicate the feature is not working correctly.")
# Show some example traces with metadata
print()
print("Sample trace entries with user_metadata field:")
print("-" * 70)
shown = 0
for i, trace in enumerate(traces):
if 'user_metadata' in trace and trace.get('action') == 'alloc':
print(f"\nTrace #{i}:")
print(f" action: {trace.get('action')}")
print(f" size: {trace.get('size'):,} bytes")
print(f" user_metadata: '{trace.get('user_metadata')}'")
print(f" compile_context: '{trace.get('compile_context')}'")
shown += 1
if shown >= 5: # Show first 5
break
print()
print("=" * 70)
# Save snapshot for inspection
snapshot_file = "/tmp/memory_snapshot_with_metadata.pickle"
print(f"9. Saving snapshot to {snapshot_file}...")
torch.cuda.memory._dump_snapshot(snapshot_file)
    print("   Snapshot saved! You can inspect it with:")
print(f" python -c \"import pickle; s=pickle.load(open('{snapshot_file}','rb')); print(s)\"")
# Cleanup
print()
print("10. Cleaning up...")
del x, y, z
torch.cuda.empty_cache()
print()
print("✓ Test completed successfully!")
print()
print("Summary:")
print(" - Successfully set and retrieved custom metadata")
print(" - Metadata was recorded in memory allocations")
print(" - Snapshot contains user_metadata field in trace entries")
if __name__ == "__main__":
test_memory_metadata()

View File

@@ -764,6 +764,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) {
py::str frames_s = "frames";
py::str time_us_s = "time_us";
py::str compile_context_s = "compile_context";
py::str user_metadata_s = "user_metadata";
py::list empty_frames;
std::vector<CapturedTraceback*> to_gather_frames;
@@ -881,6 +882,7 @@
trace_entry[stream_s] = int64_t(te.stream_);
trace_entry[time_us_s] = te.time_.t_;
trace_entry[compile_context_s] = te.compile_context_;
trace_entry[user_metadata_s] = te.user_metadata_;
trace.append(trace_entry);
}
traces.append(trace);
@@ -1136,6 +1138,14 @@ static void registerCudaDeviceProperties(PyObject* module) {
return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
});
m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) {
c10::cuda::CUDACachingAllocator::setUserMetadata(metadata);
});
m.def("_cuda_getMemoryMetadata", []() {
return c10::cuda::CUDACachingAllocator::getUserMetadata();
});
m.def("_cuda_get_conv_benchmark_empty_cache", []() {
return at::native::_cudnn_get_conv_benchmark_empty_cache();
});

View File

@@ -310,6 +310,7 @@ std::string _memory_snapshot_pickled() {
IValue is_expandable_s = "is_expandable";
IValue time_us_s = "time_us";
IValue compile_contexts_s = "compile_context";
IValue user_metadata_s = "user_metadata";
auto empty_frames = new_list();
@@ -427,6 +428,7 @@
trace_entry.insert(size_s, (int64_t)te.size_);
trace_entry.insert(stream_s, int64_t(te.stream_));
trace_entry.insert(compile_contexts_s, te.compile_context_);
trace_entry.insert(user_metadata_s, te.user_metadata_);
if (te.context_) {
auto sc = getFromContext(te.context_);
frame_tracebacks.push_back(sc);
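Since user_metadata is now serialized on both snapshot paths (the Python dict built in Module.cpp above and the pickled form here), a dump written by torch.cuda.memory._dump_snapshot can be aggregated offline. A short sketch, reusing the file name from the test script above:

import pickle
from collections import defaultdict

with open("/tmp/memory_snapshot_with_metadata.pickle", "rb") as f:
    snapshot = pickle.load(f)

# Sum allocated bytes per user_metadata tag across all devices.
bytes_by_tag = defaultdict(int)
for device_trace in snapshot["device_traces"]:
    for entry in device_trace:
        if entry.get("action") == "alloc":
            bytes_by_tag[entry.get("user_metadata", "")] += entry.get("size", 0)

for tag, total in sorted(bytes_by_tag.items()):
    print(f"{tag or '<untagged>'}: {total:,} bytes")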

View File

@@ -1063,6 +1063,36 @@ def _dump_snapshot(filename="dump_snapshot.pickle"):
pickle.dump(s, f)
def _set_memory_metadata(metadata: str):
    """
    Set custom metadata that will be attached to all subsequent CUDA memory
    allocations made by the calling thread.

    The metadata is recorded in the memory snapshot for every allocation made
    after this call until it is cleared or changed. Because the underlying
    storage is thread-local, each thread annotates only its own allocations.

    Args:
        metadata (str): Custom metadata string to attach to allocations.
            Pass an empty string to clear the metadata.

    Example:
        >>> torch.cuda.memory._set_memory_metadata("training_phase")
        >>> # All allocations here will have "training_phase" metadata
        >>> x = torch.randn(100, 100, device='cuda')
        >>> torch.cuda.memory._set_memory_metadata("")  # Clear metadata
    """
torch._C._cuda_setMemoryMetadata(metadata)
def _get_memory_metadata() -> str:
    """
    Get the custom metadata currently being attached to CUDA memory
    allocations made by the calling thread.

    Returns:
        str: The current metadata string, or an empty string if none is set.
    """
return torch._C._cuda_getMemoryMetadata()
def _save_segment_usage(filename="output.svg", snapshot=None):
if snapshot is None:
snapshot = _snapshot()
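Because the new API is plain set/get state, a scoped annotation helper is easy to layer on top of it. A hypothetical convenience wrapper, not part of this change, that restores the previous tag on exit:

import contextlib

import torch

@contextlib.contextmanager
def cuda_memory_tag(tag: str):
    # Hypothetical helper built on the two functions above: attach tag to
    # allocations made inside the block, then restore the prior metadata.
    previous = torch.cuda.memory._get_memory_metadata()
    torch.cuda.memory._set_memory_metadata(tag)
    try:
        yield
    finally:
        torch.cuda.memory._set_memory_metadata(previous)

with cuda_memory_tag("training_phase"):
    x = torch.randn(100, 100, device="cuda")  # traced with "training_phase"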