mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[Memory Snapshot] Make recordAnnotations callback initialize lazily (#129242)
Summary: Initialize the RecordFunction callback used by recordAnnotations lazily, when memory-history recording starts, rather than eagerly when the CUDA bindings are registered. This helps reduce the impact on the Time To First Batch metric. Test Plan: CI and ran locally. Differential Revision: D58875576 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/129242 Approved by: https://github.com/zdevito
This commit is contained in:
committed by
PyTorch MergeBot
parent
858fb05dac
commit
f42d5b6dca
@ -4,7 +4,6 @@
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include <c10/core/Device.h>
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <ATen/core/TensorBody.h>
|
||||
#include <ATen/cuda/CUDAConfig.h>
|
||||
#include <ATen/native/ConvUtils.h>
|
||||
#include <ATen/record_function.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/TensorImpl.h>
|
||||
#include <c10/util/UniqueVoidPtr.h>
|
||||
@ -39,7 +38,6 @@
|
||||
#include <torch/csrc/cuda/THCP.h>
|
||||
#include <torch/csrc/cuda/memory_snapshot.h>
|
||||
#include <torch/csrc/cuda/python_comm.h>
|
||||
#include <torch/csrc/profiler/combined_traceback.h>
|
||||
#include <torch/csrc/profiler/python/combined_traceback.h>
|
||||
#include <torch/csrc/python_headers.h>
|
||||
#include <torch/csrc/utils/device_lazy_init.h>
|
||||
@ -967,28 +965,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
|
||||
const std::string&,
|
||||
size_t)>(torch::cuda::_record_memory_history));
|
||||
|
||||
// Save user annotations to CCA memory snapshot tool
|
||||
at::addThreadLocalCallback(at::RecordFunctionCallback(
|
||||
[](const at::RecordFunction& fn) -> std::unique_ptr<at::ObserverContext> {
|
||||
if (fn.scope() != at::RecordScope::USER_SCOPE) {
|
||||
return nullptr; // only record user-defined scopes.
|
||||
}
|
||||
unwind::Frame frame{fn.name(), "START", 0};
|
||||
auto r = std::make_shared<CapturedTraceback>();
|
||||
r->recordUserDefinedFrame(frame);
|
||||
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
|
||||
return nullptr;
|
||||
},
|
||||
[](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
|
||||
if (fn.scope() != at::RecordScope::USER_SCOPE) {
|
||||
return; // only record user-defined scopes.
|
||||
}
|
||||
unwind::Frame frame{fn.name(), "END", 0};
|
||||
auto r = std::make_shared<CapturedTraceback>();
|
||||
r->recordUserDefinedFrame(frame);
|
||||
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
|
||||
}));
|
||||
|
||||
m.def("_cuda_isHistoryEnabled", []() {
|
||||
return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
|
||||
});
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/record_function.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <torch/csrc/cuda/memory_snapshot.h>
|
||||
#include <torch/csrc/jit/runtime/interpreter.h>
|
||||
@ -96,6 +97,34 @@ CapturedTraceback* getFromContext(
|
||||
"attempting to gather stack context from the wrong StackContext type.");
|
||||
}
|
||||
|
||||
void _initRecordAnnotations() {
|
||||
static c10::once_flag ra_init;
|
||||
c10::call_once(ra_init, [&] {
|
||||
// Save user annotations to CCA memory snapshot tool
|
||||
at::addThreadLocalCallback(at::RecordFunctionCallback(
|
||||
[](const at::RecordFunction& fn)
|
||||
-> std::unique_ptr<at::ObserverContext> {
|
||||
if (fn.scope() != at::RecordScope::USER_SCOPE) {
|
||||
return nullptr; // only record user-defined scopes.
|
||||
}
|
||||
unwind::Frame frame{fn.name(), "START", 0};
|
||||
auto r = std::make_shared<CapturedTraceback>();
|
||||
r->recordUserDefinedFrame(frame);
|
||||
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
|
||||
return nullptr;
|
||||
},
|
||||
[](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
|
||||
if (fn.scope() != at::RecordScope::USER_SCOPE) {
|
||||
return; // only record user-defined scopes.
|
||||
}
|
||||
unwind::Frame frame{fn.name(), "END", 0};
|
||||
auto r = std::make_shared<CapturedTraceback>();
|
||||
r->recordUserDefinedFrame(frame);
|
||||
c10::cuda::CUDACachingAllocator::recordAnnotation(r);
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void _record_memory_history(
|
||||
@ -117,6 +146,7 @@ void _record_memory_history(
|
||||
when = c10::cuda::CUDACachingAllocator::RecordContext::STATE;
|
||||
}
|
||||
at::globalContext().lazyInitCUDA();
|
||||
_initRecordAnnotations();
|
||||
c10::cuda::CUDACachingAllocator::recordHistory(
|
||||
enabled, recorder, trace_alloc_max_entries, when);
|
||||
}
|
||||
@ -167,6 +197,7 @@ void _record_memory_history(
|
||||
}
|
||||
}
|
||||
at::globalContext().lazyInitCUDA();
|
||||
_initRecordAnnotations();
|
||||
c10::cuda::CUDACachingAllocator::recordHistory(
|
||||
enabled.has_value(), recorder, max_entries, when);
|
||||
}
|
||||
|
Reference in New Issue
Block a user