[Memory Snapshot] Make recordAnnotations callback initialize lazily (#129242)

Summary: Register the RecordFunction callback used by recordAnnotations lazily, the first time memory-history recording is started, instead of eagerly. This reduces the impact on the Time To First Batch metric.

Test Plan: CI and ran locally.

Differential Revision: D58875576

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129242
Approved by: https://github.com/zdevito
This commit is contained in:
Aaron Enye Shi
2024-06-22 04:05:55 +00:00
committed by PyTorch MergeBot
parent 858fb05dac
commit f42d5b6dca
3 changed files with 31 additions and 25 deletions

View File

@ -4,7 +4,6 @@
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <c10/core/Device.h>

View File

@ -3,7 +3,6 @@
#include <ATen/core/TensorBody.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/record_function.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/UniqueVoidPtr.h>
@ -39,7 +38,6 @@
#include <torch/csrc/cuda/THCP.h>
#include <torch/csrc/cuda/memory_snapshot.h>
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/profiler/combined_traceback.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/device_lazy_init.h>
@ -967,28 +965,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
const std::string&,
size_t)>(torch::cuda::_record_memory_history));
// Save user annotations to CCA memory snapshot tool
at::addThreadLocalCallback(at::RecordFunctionCallback(
[](const at::RecordFunction& fn) -> std::unique_ptr<at::ObserverContext> {
if (fn.scope() != at::RecordScope::USER_SCOPE) {
return nullptr; // only record user-defined scopes.
}
unwind::Frame frame{fn.name(), "START", 0};
auto r = std::make_shared<CapturedTraceback>();
r->recordUserDefinedFrame(frame);
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
return nullptr;
},
[](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
if (fn.scope() != at::RecordScope::USER_SCOPE) {
return; // only record user-defined scopes.
}
unwind::Frame frame{fn.name(), "END", 0};
auto r = std::make_shared<CapturedTraceback>();
r->recordUserDefinedFrame(frame);
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
}));
m.def("_cuda_isHistoryEnabled", []() {
return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
});

View File

@ -1,4 +1,5 @@
#include <ATen/Context.h>
#include <ATen/record_function.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <torch/csrc/cuda/memory_snapshot.h>
#include <torch/csrc/jit/runtime/interpreter.h>
@ -96,6 +97,34 @@ CapturedTraceback* getFromContext(
"attempting to gather stack context from the wrong StackContext type.");
}
void _initRecordAnnotations() {
static c10::once_flag ra_init;
c10::call_once(ra_init, [&] {
// Save user annotations to CCA memory snapshot tool
at::addThreadLocalCallback(at::RecordFunctionCallback(
[](const at::RecordFunction& fn)
-> std::unique_ptr<at::ObserverContext> {
if (fn.scope() != at::RecordScope::USER_SCOPE) {
return nullptr; // only record user-defined scopes.
}
unwind::Frame frame{fn.name(), "START", 0};
auto r = std::make_shared<CapturedTraceback>();
r->recordUserDefinedFrame(frame);
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
return nullptr;
},
[](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
if (fn.scope() != at::RecordScope::USER_SCOPE) {
return; // only record user-defined scopes.
}
unwind::Frame frame{fn.name(), "END", 0};
auto r = std::make_shared<CapturedTraceback>();
r->recordUserDefinedFrame(frame);
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
}));
});
}
} // namespace
void _record_memory_history(
@ -117,6 +146,7 @@ void _record_memory_history(
when = c10::cuda::CUDACachingAllocator::RecordContext::STATE;
}
at::globalContext().lazyInitCUDA();
_initRecordAnnotations();
c10::cuda::CUDACachingAllocator::recordHistory(
enabled, recorder, trace_alloc_max_entries, when);
}
@ -167,6 +197,7 @@ void _record_memory_history(
}
}
at::globalContext().lazyInitCUDA();
_initRecordAnnotations();
c10::cuda::CUDACachingAllocator::recordHistory(
enabled.has_value(), recorder, max_entries, when);
}