[Memory Snapshot] Make recordAnnotations callback initialize lazily (#129242)

Summary: Make the recordAnnotations' Record function callback lazily initialize when record memory history starts. This will help reduce the impact on Time To First Batch metric. Test Plan: CI and ran locally. Differential Revision: D58875576 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/129242 Approved by: https://github.com/zdevito
2025-10-20 21:14:14 +08:00 · 2024-06-22 04:05:55 +00:00
parent 858fb05dac
commit f42d5b6dca
3 changed files with 31 additions and 25 deletions
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@ -4,7 +4,6 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
-#include <string>
 #include <utility>

 #include <c10/core/Device.h>
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@ -3,7 +3,6 @@
 #include <ATen/core/TensorBody.h>
 #include <ATen/cuda/CUDAConfig.h>
 #include <ATen/native/ConvUtils.h>
-#include <ATen/record_function.h>
 #include <c10/core/Device.h>
 #include <c10/core/TensorImpl.h>
 #include <c10/util/UniqueVoidPtr.h>
@ -39,7 +38,6 @@
 #include <torch/csrc/cuda/THCP.h>
 #include <torch/csrc/cuda/memory_snapshot.h>
 #include <torch/csrc/cuda/python_comm.h>
-#include <torch/csrc/profiler/combined_traceback.h>
 #include <torch/csrc/profiler/python/combined_traceback.h>
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/device_lazy_init.h>
@ -967,28 +965,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
          const std::string&,
          size_t)>(torch::cuda::_record_memory_history));

-  // Save user annotations to CCA memory snapshot tool
-  at::addThreadLocalCallback(at::RecordFunctionCallback(
-      [](const at::RecordFunction& fn) -> std::unique_ptr<at::ObserverContext> {
-        if (fn.scope() != at::RecordScope::USER_SCOPE) {
-          return nullptr; // only record user-defined scopes.
-        }
-        unwind::Frame frame{fn.name(), "START", 0};
-        auto r = std::make_shared<CapturedTraceback>();
-        r->recordUserDefinedFrame(frame);
-        c10::cuda::CUDACachingAllocator::recordAnnotation(r);
-        return nullptr;
-      },
-      [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
-        if (fn.scope() != at::RecordScope::USER_SCOPE) {
-          return; // only record user-defined scopes.
-        }
-        unwind::Frame frame{fn.name(), "END", 0};
-        auto r = std::make_shared<CapturedTraceback>();
-        r->recordUserDefinedFrame(frame);
-        c10::cuda::CUDACachingAllocator::recordAnnotation(r);
-      }));
-
  m.def("_cuda_isHistoryEnabled", []() {
    return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
  });
--- a/torch/csrc/cuda/memory_snapshot.cpp
+++ b/torch/csrc/cuda/memory_snapshot.cpp
@ -1,4 +1,5 @@
 #include <ATen/Context.h>
+#include <ATen/record_function.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <torch/csrc/cuda/memory_snapshot.h>
 #include <torch/csrc/jit/runtime/interpreter.h>
@ -96,6 +97,34 @@ CapturedTraceback* getFromContext(
      "attempting to gather stack context from the wrong StackContext type.");
 }

+void _initRecordAnnotations() {
+  static c10::once_flag ra_init;
+  c10::call_once(ra_init, [&] {
+    // Save user annotations to CCA memory snapshot tool
+    at::addThreadLocalCallback(at::RecordFunctionCallback(
+        [](const at::RecordFunction& fn)
+            -> std::unique_ptr<at::ObserverContext> {
+          if (fn.scope() != at::RecordScope::USER_SCOPE) {
+            return nullptr; // only record user-defined scopes.
+          }
+          unwind::Frame frame{fn.name(), "START", 0};
+          auto r = std::make_shared<CapturedTraceback>();
+          r->recordUserDefinedFrame(frame);
+          c10::cuda::CUDACachingAllocator::recordAnnotation(r);
+          return nullptr;
+        },
+        [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
+          if (fn.scope() != at::RecordScope::USER_SCOPE) {
+            return; // only record user-defined scopes.
+          }
+          unwind::Frame frame{fn.name(), "END", 0};
+          auto r = std::make_shared<CapturedTraceback>();
+          r->recordUserDefinedFrame(frame);
+          c10::cuda::CUDACachingAllocator::recordAnnotation(r);
+        }));
+  });
+}
+
 } // namespace

 void _record_memory_history(
@ -117,6 +146,7 @@ void _record_memory_history(
    when = c10::cuda::CUDACachingAllocator::RecordContext::STATE;
  }
  at::globalContext().lazyInitCUDA();
+  _initRecordAnnotations();
  c10::cuda::CUDACachingAllocator::recordHistory(
      enabled, recorder, trace_alloc_max_entries, when);
 }
@ -167,6 +197,7 @@ void _record_memory_history(
    }
  }
  at::globalContext().lazyInitCUDA();
+  _initRecordAnnotations();
  c10::cuda::CUDACachingAllocator::recordHistory(
      enabled.has_value(), recorder, max_entries, when);
 }