[CCA][Memory Snapshot] Create TraceEntryRingBuffer class for alloc_trace logic (#130741)
Summary: Move the alloc_trace logic into a separate class to reduce the risk of deadlocks when mixing with CCA's lock, and switch to an std::mutex instead of an std::recursive_mutex. This lets us reuse the TraceEntryRingBuffer logic in later diffs.

Test Plan: CI, resnet run, and FBR model.

Differential Revision: D59690408

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130741
Approved by: https://github.com/davidberard98
Committed by: PyTorch MergeBot
Parent: e11c41035c
Commit: aa4ad711ef
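Before the diff itself, here is a rough standalone sketch of the pattern the change introduces: a fixed-capacity ring buffer that owns its own plain std::mutex, so callers never hold or re-enter an outer allocator lock while the buffer is touched. This is not PyTorch code; RingBufferSketch, its int payload standing in for TraceEntry, and the method names are invented for illustration only.

    // Standalone sketch only: names and the int payload are placeholders,
    // not identifiers from the PyTorch diff below.
    #include <algorithm>
    #include <cstddef>
    #include <mutex>
    #include <vector>

    class RingBufferSketch {
     public:
      void setMaxEntries(std::size_t size) {
        std::lock_guard<std::mutex> lk(lock_);
        max_entries_ = std::max<std::size_t>(1, size);
      }

      void insert(int value) {
        std::lock_guard<std::mutex> lk(lock_);
        if (entries_.size() < max_entries_) {
          entries_.push_back(value); // still filling the buffer
        } else {
          entries_[next_++] = value; // overwrite the oldest slot
          if (next_ == max_entries_) {
            next_ = 0; // wrap around
          }
        }
      }

      void clear() {
        std::lock_guard<std::mutex> lk(lock_);
        next_ = 0;
        entries_.clear();
      }

     private:
      std::mutex lock_; // a plain mutex owned by the buffer itself
      std::size_t max_entries_ = 1;
      std::size_t next_ = 0;
      std::vector<int> entries_;
    };

    int main() {
      RingBufferSketch buf;
      buf.setMaxEntries(3);
      for (int i = 1; i <= 5; ++i) {
        buf.insert(i); // capacity is 3, so the slots for 1 and 2 get overwritten
      }
      return 0;
    }

Keeping the lock private to the buffer is what allows the caller side to drop the std::recursive_mutex: each public method takes the lock for a short, non-reentrant critical section and releases it before returning, which is the deadlock-risk reduction the summary describes. The actual change follows.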
@@ -792,6 +792,69 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
   }
 }
 
+class TraceEntryRingBuffer {
+ public:
+  TraceEntryRingBuffer() {
+    // alloc_trace is a pointer because we need to intentionally
+    // leak this on deallocation it can hold references to Python
+    // state which will already be destroyed when we are in exit handlers
+    // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+    alloc_trace = new std::vector<TraceEntry>();
+  }
+
+  void setMaxEntries(size_t size) {
+    std::lock_guard<std::mutex> lk(alloc_trace_lock);
+    alloc_trace_max_entries_ = std::max(size_t(1), size);
+  }
+
+  void insertTraceEntries(const TraceEntry& te) {
+    std::lock_guard<std::mutex> lk(alloc_trace_lock);
+    if (alloc_trace->size() < alloc_trace_max_entries_) {
+      alloc_trace->emplace_back(te);
+    } else {
+      (*alloc_trace)[alloc_trace_next++] = te;
+      if (alloc_trace_next == alloc_trace_max_entries_) {
+        alloc_trace_next = 0;
+      }
+    }
+  }
+
+  void getTraceEntries(std::vector<TraceEntry>& result) {
+    std::lock_guard<std::mutex> lk(alloc_trace_lock);
+    result.reserve(alloc_trace->size());
+    result.insert(
+        result.end(),
+        alloc_trace->begin() +
+            static_cast<std::vector<TraceEntry>::difference_type>(
+                alloc_trace_next),
+        alloc_trace->end());
+    result.insert(
+        result.end(),
+        alloc_trace->begin(),
+        alloc_trace->begin() +
+            static_cast<std::vector<TraceEntry>::difference_type>(
+                alloc_trace_next));
+  }
+
+  void clear() {
+    std::lock_guard<std::mutex> lk(alloc_trace_lock);
+    alloc_trace_next = 0;
+    alloc_trace->clear();
+  }
+
+ private:
+  size_t alloc_trace_max_entries_ = 1;
+
+  // Both alloc_trace and alloc_trace_next needs to be used
+  // under alloc_trace_lock.
+  std::mutex alloc_trace_lock;
+  size_t alloc_trace_next = 0;
+  std::vector<TraceEntry>*
+      alloc_trace; // pointer because we need to intentionally leak this on
+                   // deallocation it can hold references to Python state which
+                   // will already be destroyed when we are in exit handlers
+};
+
 } // anonymous namespace
 } // namespace Native
 
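One detail worth noting in getTraceEntries above is the read-out order: it splices [alloc_trace_next, end) first and [begin, alloc_trace_next) second, which hands the caller the surviving entries oldest-first once the buffer has wrapped. A tiny self-contained check of that ordering, using plain ints in place of TraceEntry (nothing below is PyTorch API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      // State of a capacity-3 ring after inserting 1, 2, 3, 4, 5:
      // the slots hold {4, 5, 3} and the next overwrite position is 2.
      std::vector<int> ring = {4, 5, 3};
      std::size_t next = 2;

      std::vector<int> out;
      out.reserve(ring.size());
      // Older entries live in [next, end), newer ones in [begin, next).
      out.insert(
          out.end(),
          ring.begin() + static_cast<std::ptrdiff_t>(next),
          ring.end());
      out.insert(
          out.end(),
          ring.begin(),
          ring.begin() + static_cast<std::ptrdiff_t>(next));

      for (int v : out) {
        std::printf("%d ", v); // prints: 3 4 5  (oldest to newest)
      }
      std::printf("\n");
      return 0;
    }

The same two-range splice appears in the snapshot path that this diff removes from DeviceCachingAllocator further down.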
@@ -902,18 +965,9 @@ class DeviceCachingAllocator {
 
   std::atomic<CreateContextFn> context_recorder_;
   RecordContext record_context_ = RecordContext::NEVER;
-  size_t alloc_trace_max_entries_ = 1;
 
-  // Both alloc_trace and alloc_trace_next needs to be used
-  // under alloc_trace_lock.
-  // TODO: reduce risk of deadlock and remove recursive lock by
-  // wrapping this into a class instance.
-  std::recursive_mutex alloc_trace_lock;
-  size_t alloc_trace_next = 0;
-  std::vector<TraceEntry>*
-      alloc_trace; // pointer because we need to intentionally leak this on
-                   // deallocation it can hold references to Python state which
-                   // will already be destroyed when we are in exit handlers
+  // Ring buffer for memory snapshot TraceEntry's
+  TraceEntryRingBuffer alloc_buffer;
 
   // Members specific to CUDA graphs
 
@@ -939,9 +993,7 @@ class DeviceCachingAllocator {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   DeviceCachingAllocator()
-      : large_blocks(/*small=*/false),
-        small_blocks(/*small=*/true),
-        alloc_trace(new std::vector<TraceEntry>()) {
+      : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
     stats.max_split_size =
         static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
     context_recorder_.store(nullptr);
@@ -950,18 +1002,16 @@ class DeviceCachingAllocator {
   void recordHistory(
       bool enabled,
       CreateContextFn context_recorder,
-      size_t alloc_trace_max_entries,
+      size_t alloc_buffer_max_entries,
       RecordContext when) {
     std::unique_lock<std::recursive_mutex> lock(mutex);
     TORCH_CHECK(when == RecordContext::NEVER || context_recorder);
     record_history = enabled;
     context_recorder_.store(record_history ? context_recorder : nullptr);
-    alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries);
+    alloc_buffer.setMaxEntries(alloc_buffer_max_entries);
     record_context_ = enabled ? when : RecordContext::NEVER;
     if (!enabled) {
-      std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
-      alloc_trace_next = 0;
-      alloc_trace->clear();
+      alloc_buffer.clear();
     }
   }
 
@@ -1792,23 +1842,7 @@ class DeviceCachingAllocator {
       const std::function<time_t(approx_time_t)>& tsc_to_us) {
     std::lock_guard<std::recursive_mutex> lock(mutex);
     std::vector<TraceEntry> result;
-
-    {
-      std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
-      result.reserve(alloc_trace->size());
-      result.insert(
-          result.end(),
-          alloc_trace->begin() +
-              static_cast<std::vector<TraceEntry>::difference_type>(
-                  alloc_trace_next),
-          alloc_trace->end());
-      result.insert(
-          result.end(),
-          alloc_trace->begin(),
-          alloc_trace->begin() +
-              static_cast<std::vector<TraceEntry>::difference_type>(
-                  alloc_trace_next));
-    }
+    alloc_buffer.getTraceEntries(result);
 
     // Convert all the timestamps from tsc to epoch time in microseconds.
     for (auto& te : result) {
@@ -2910,15 +2944,7 @@ class DeviceCachingAllocator {
     }
 
     if (record_history) {
-      std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
-      if (alloc_trace->size() < alloc_trace_max_entries_) {
-        alloc_trace->emplace_back(te);
-      } else {
-        (*alloc_trace)[alloc_trace_next++] = te;
-        if (alloc_trace_next == alloc_trace_max_entries_) {
-          alloc_trace_next = 0;
-        }
-      }
+      alloc_buffer.insertTraceEntries(te);
     }
   }
 };
@@ -3062,11 +3088,11 @@ class NativeCachingAllocator : public CUDAAllocator {
   void recordHistory(
       bool enabled,
       CreateContextFn context_recorder,
-      size_t alloc_trace_max_entries,
+      size_t alloc_buffer_max_entries,
       RecordContext when) override {
     for (auto& allocator : device_allocator) {
       allocator->recordHistory(
-          enabled, context_recorder, alloc_trace_max_entries, when);
+          enabled, context_recorder, alloc_buffer_max_entries, when);
     }
   }
 