There are some unfortunate style issues, like four space indents and various other minor issues. There is a pretty big overhaul coming to the python tracer, so I want to be able to commit them with more style compliant code.

Differential Revision: [D36070201](https://our.internmc.facebook.com/intern/diff/D36070201/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/77692
Approved by: https://github.com/aaronenyeshi

#include <torch/csrc/autograd/profiler_python.h>

#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <Python.h>
#include <frameobject.h>

#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <torch/csrc/autograd/profiler_kineto.h>
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/utils/pybind.h>

namespace py = pybind11;


namespace torch { namespace autograd { namespace profiler { namespace python_tracer {
namespace {

// ============================================================================
// == Core data types =========================================================
// ============================================================================

// PyObject that allows different threads to record events without colliding.
// It is passed as the second argument when enabling tracing via
// `PyEval_SetProfile`.
struct TraceContext {
  PyObject_HEAD

  // It is wasteful to store an entire PyThreadState* in RawEvent. So
  // instead, we map thread ids down to a compact space that we can store in
  // a single byte.
  uint8_t thread_id_;
  PyThreadState* thread_state_;

  // Likewise, int64_t has more precision than we need. By tracking when the
  // profiler starts we can store "time since profile begin", which fits
  // into less space.
  int64_t initial_us_;

  // TODO:
  //   Wall time is actually fairly expensive to compute. Empirically, it
  //   takes ~600 ns to call `now()`. This puts a hard lower bound on the
  //   overhead of the tracer. If we collected wall time less frequently and
  //   used TSC (e.g. through __rdtsc) to interpolate, it should be possible
  //   to reduce time spent on timestamps while retaining the same level of
  //   accuracy.
};

// CPython boilerplate to define `TraceContext` as a proper python object.
static PyTypeObject TraceContextType = {
  PyVarObject_HEAD_INIT(nullptr, 0)
  "TraceContext",             /* tp_name */
  sizeof(TraceContext),       /* tp_basicsize */
  0,                          /* tp_itemsize */
  nullptr,                    /* tp_dealloc */
  0,                          /* tp_vectorcall_offset */  // NOLINT: modernize-use-nullptr
  nullptr,                    /* tp_getattr */
  nullptr,                    /* tp_setattr */
  nullptr,                    /* tp_reserved */
  nullptr,                    /* tp_repr */
  nullptr,                    /* tp_as_number */
  nullptr,                    /* tp_as_sequence */
  nullptr,                    /* tp_as_mapping */
  nullptr,                    /* tp_hash */
  nullptr,                    /* tp_call */
  nullptr,                    /* tp_str */
  nullptr,                    /* tp_getattro */
  nullptr,                    /* tp_setattro */
  nullptr,                    /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,         /* tp_flags */
  "Python tracer TLS",        /* tp_doc */
  nullptr,                    /* tp_traverse */
  nullptr,                    /* tp_clear */
  nullptr,                    /* tp_richcompare */
  0,                          /* tp_weaklistoffset */
  nullptr,                    /* tp_iter */
  nullptr,                    /* tp_iternext */
  nullptr,                    /* tp_methods */
  nullptr,                    /* tp_members */
  nullptr,                    /* tp_getset */
  nullptr,                    /* tp_base */
  nullptr,                    /* tp_dict */
  nullptr,                    /* tp_descr_get */
  nullptr,                    /* tp_descr_set */
  0,                          /* tp_dictoffset */
  nullptr,                    /* tp_init */
  nullptr,                    /* tp_alloc */
  PyType_GenericNew,          /* tp_new */
  nullptr                     /* tp_free */
};

// CPython has a more expressive set of events for tracing / profiling:
//   https://github.com/python/cpython/blob/f291404a802d6a1bc50f817c7a26ff3ac9a199ff/Include/cpython/pystate.h#L22-L29
// As an implementation detail they are defined as 0-7, however we don't want
// to rely on that while bit packing. Furthermore, the CPython descriptions
// are finer granularity than we're interested in. We do not need to
// differentiate between a normal return and an exception (both act as a pop
// in our replay stack), and we are not interested in `PyTrace_LINE` or
// `PyTrace_OPCODE`. To simplify things we store our own enum when tracefunc
// is called, and then use it for all subsequent processing.
enum TraceTag { kPy_Call = 0, kPy_Return, kC_Call, kC_Return };
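// For reference, `pyProfileFn` below collapses the CPython events as follows
// (a summary of its switch statement, not additional behavior):
//   PyTrace_CALL                          -> kPy_Call
//   PyTrace_RETURN, PyTrace_EXCEPTION     -> kPy_Return
//   PyTrace_C_CALL                        -> kC_Call
//   PyTrace_C_RETURN, PyTrace_C_EXCEPTION -> kC_Return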

// When we are tracing a Python program, the general procedure is to record
// every time we enter or exit a function and later replay these events during
// post processing. Thus, during the profiling phase we want to do the MINIMAL
// amount of work to capture all of the information that we need; otherwise we
// will distort the profile. (While we don't wish to be terribly inefficient
// during post processing, we are willing to do extra fixup work in post if it
// reduces overhead in the profiling phase.)
//
// To that end, `RawEvent` (which logs calls and returns) is bitpacked to
// reduce data stored and fit more events on a cache line. The following
// techniques are used:
//
//   1) Storing `tag_` as a uint8_t rather than a TraceTag.
//      The size of an enum, surprisingly, is not the amount of space needed
//      to store all the fields, but rather *at least* that size.
//      (`sizeof(TraceTag) == 2` on my system, for example.)
//
//   2) Storing thread id rather than the full PyThreadState*.
//
//   3) Storing f_lasti as a uint16_t rather than a full int.
//      In practice this is plenty. It is also less dangerous than it might
//      initially seem; when we call the CPython API (`PyCode_Addr2Line`) we
//      use the full int `f_lasti`. The truncation in the stored event only
//      affects the cache key when we replay the stack. While this could
//      result in cache misses (and unknown names) in corner cases, it has
//      the significant benefit of letting us skip the full line number
//      calculation after the first call to a function.
//
//   4) Storing time relative to the start of profiling.
//      In general profiling is short lived. Storing an entire int64_t just
//      to record that a handful of microseconds have passed is not a good
//      use of bits. So instead, we record the time since profiling began.
//      We can fit over an hour into a uint32_t, which is far longer than the
//      profiler should ever run for a continuous period.
//
// With these tricks we can pack all of the above into a single 8 byte word.
// The second word is case dependent.
//
// One obvious question is: why manually tag the union rather than using a
// `std::variant`? (Or `c10::variant`, as it were.) The answer is that due
// to alignment the tag would have to be packed with the union data and
// `RawEvent` would grow to three words. (Not just 50% bigger, but also less
// cache friendly.)
struct RawEvent {
  RawEvent(TraceTag tag, int lasti, TraceContext* ctx)
      : tag_(static_cast<uint8_t>(tag)),
        thread_id_(ctx->thread_id_),
        lasti_(static_cast<uint16_t>(lasti)),
        misc_() {
    int64_t t = now() - ctx->initial_us_;
    t_ = static_cast<uint32_t>(t);

    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        lasti <= std::numeric_limits<uint16_t>::max());
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits<uint32_t>::max());
  }

  RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code)
      : RawEvent(tag, lasti, ctx) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call);
    misc_.f_code_ = f_code;
  }

  RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg)
      : RawEvent(tag, lasti, ctx) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call);
    misc_.arg_ = arg;
  }

  uint8_t tag_;
  uint8_t thread_id_;
  uint16_t lasti_;
  uint32_t t_;
  union {
    // TraceTag::kPy_Call
    PyCodeObject* f_code_;

    // TraceTag::kC_Call
    PyObject* arg_;

    // TraceTag::kPy_Return
    // TraceTag::kC_Return
    // ** Unused (placeholder) **
    void* null_;
  } misc_;

  C10_NODISCARD TraceTag tag() const {
    return static_cast<TraceTag>(tag_);
  }

  C10_NODISCARD int lasti() const {
    // f_lasti is positive, with one exception: CPython initializes frames
    // with `f_lasti = -1`. We don't want to give up half of the range by
    // switching to int16_t. So instead we do the fast (underflowing) cast
    // in the ctor, and rectify the value in this accessor, which should
    // only be called during trace post processing.
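    // (For example, a freshly created frame's `f_lasti` of -1 is stored as
    // 0xFFFF by the constructor's cast and mapped back to -1 here.)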
    return lasti_ == std::numeric_limits<uint16_t>::max() ? (int)(-1)
                                                          : (int)lasti_;
  }
};

// Make sure the bit packing that we do in RawEvent actually results in the
// desired size reduction.
static_assert(sizeof(RawEvent) <= 16, "RawEvent is too large");
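
// Rough layout on a typical 64-bit build (illustrative only; just the total
// size is checked above, not individual fields):
//   word 1: tag_ (1 byte) | thread_id_ (1 byte) | lasti_ (2 bytes) | t_ (4 bytes)
//   word 2: misc_ union (one 8 byte pointer)
// A uint32_t of microseconds covers 2^32 us ~= 4295 s ~= 71.6 minutes, which
// is where the "over an hour" figure above comes from.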

// std::hash doesn't have a specialization for pairs so we have to define one.
// A simple XOR is good enough for our purposes.
struct hash_pair {
  template <class T1, class T2>
  size_t operator()(const std::pair<T1, T2>& pair) const {
    return std::hash<T1>()(pair.first) ^ std::hash<T2>()(pair.second);
  }
};
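
// (Its only use in this file is as the hasher for the
// `ska::flat_hash_map<DescriptionKey, CodeDescription, hash_pair>` cache
// declared in `PythonTracer` below.)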

// ============================================================================
// == Tracing implementation ==================================================
// ============================================================================
constexpr size_t max_py_threads = std::numeric_limits<uint8_t>::max() + 1;
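// (That is 256, matching the one byte `thread_id_` space in `RawEvent`.)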

class PythonTracer final {
 public:
  // Static methods serve as external interfaces (which expect raw pointers)
  // and handle forwarding to the singleton.
  static void call(Command c);

  static int pyProfileFn(
      PyObject* obj,
      PyFrameObject* frame,
      int what,
      PyObject* arg);

 private:
  PythonTracer();
  static PythonTracer& singleton();
  friend class PyTraceReplay;

  void start(size_t max_threads = max_py_threads);
  void stop();
  void clear();

  void recordPyCall(TraceContext* ctx, PyFrameObject* frame);
  void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg);
  void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag);

  void storeDescription(PyFrameObject* frame);
  void trackModule(PyFrameObject* frame);

  // It is imperative that we do not store strings for each python function,
  // as that would do terrible things to our profiling overhead. So instead
  // we store the much cheaper pair of `PyCodeObject*` and `int`, which we can
  // pack into `RawEvent`, and then store a mapping to the full strings the
  // first time we see a function.
  //
  // TODO:
  //   In theory we should be able to use a combination of Py_INCREF on
  //   `f_code` and string interning to skip this step. (Effectively reusing
  //   work that the CPython interpreter has already done.) However it tends
  //   to segfault, and simply caching the strings is inexpensive.
  struct CodeDescription {
    CodeDescription(int line_no, std::string filename, std::string funcname)
        : line_no_(line_no),
          filename_(std::move(filename)),
          funcname_(std::move(funcname)) {}
    int line_no_;
    std::string filename_;
    std::string funcname_;
  };

  struct ModuleForward {
    ModuleForward(size_t event_index, PyObject* self)
        : event_index_(event_index), self_(self) {}
    size_t event_index_;

    // NB:
    //   This is a non-owning reference to keep `ModuleForward` POD;
    //   `PythonTracer` owns the contents instead. We Py_INCREF in
    //   `trackModule`, and `clear` is responsible for calling Py_DECREF
    //   when clearing `module_calls_`.
    PyObject* self_;
  };

  bool active_;
  PyObject* module_call_code_;
  std::vector<std::string> path_prefixes_;
  std::vector<TraceContext*> trace_contexts_;

  std::vector<RawEvent> events_;
  std::vector<ModuleForward> module_calls_;

  using DescriptionKey = std::pair</*f_code=*/PyCodeObject*, /*f_lasti=*/int>;
  ska::flat_hash_map<DescriptionKey, CodeDescription, hash_pair>
      code_descriptions_;
  ska::flat_hash_map<PyObject*, std::string> c_function_reprs_;
};

PythonTracer& PythonTracer::singleton() {
  static PythonTracer singleton_;
  return singleton_;
}

PythonTracer::PythonTracer() : active_(false) {
  path_prefixes_ = py::module::import("torch.profiler.python_tracer")
      .attr("_prefix_regex")().cast<std::vector<std::string>>();

  module_call_code_ = py::module::import("torch.nn")
      .attr("Module")
      .attr("__call__")
      .attr("__code__")
      .ptr();
}
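
// Note: `module_call_code_` is the code object shared by every
// `torch.nn.Module.__call__` frame, so `trackModule` below can detect module
// forwards with a single pointer comparison; post processing then names such
// frames "nn.Module: <ClassName>" (see `PyTraceReplay`).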

void PythonTracer::start(size_t max_threads) {
  TORCH_CHECK(!active_, "PythonTracer is already active");
  TORCH_CHECK(
      !trace_contexts_.size(), "PythonTracer should not have active contexts");
  TORCH_CHECK(
      max_threads > 0, "max_threads must be positive, got ", max_threads);
  TORCH_CHECK(
      max_threads <= max_py_threads,
      "max_threads must be less than or equal to ",
      max_py_threads);

  pybind11::gil_scoped_acquire gil;
  auto t0 = now();

  // Loop over all threads within the current interpreter. We will need to
  // register a trace function with each thread. We set the current thread to
  // position zero to ensure that it is traced, and so we can restore the
  // thread state after registration.
  std::vector<PyThreadState*> thread_states{PyThreadState_Get()};
  if (max_threads > 1) {
    auto thread_state = thread_states[0];
    while (thread_state != nullptr) {
      if (thread_state != thread_states[0]) {
        thread_states.push_back(thread_state);
      }
      thread_state = PyThreadState_Next(thread_state);
    }

    if (thread_states.size() > max_threads) {
      std::cout << "Warning: can only trace " << max_threads << " threads. "
                << thread_states.size() << " are currently active."
                << std::endl;
      thread_states.resize(max_threads);
    }
  }

  // Register the tracer in each thread.
  for (const auto i : c10::irange(thread_states.size())) {
    PyThreadState* thread_state = thread_states[i];
    PyThreadState_Swap(thread_state);

    auto ctx = (TraceContext*)TraceContextType.tp_alloc(&TraceContextType, 0);
    ctx->thread_id_ = (uint8_t)i;
    ctx->thread_state_ = thread_state;
    ctx->initial_us_ = t0;
    trace_contexts_.push_back(ctx);

    // When we begin profiling there are already frames on the Python
    // interpreter stack. To ensure a complete trace, we must push calls
    // to all of the prior frames onto our event stack. (We stop at depth=128.)
    std::vector<PyFrameObject*> current_stack;
    auto frame = PyEval_GetFrame();
    size_t depth = 0;  // Make sure we can't infinite loop.
    while (frame != nullptr && depth <= 128) {
      current_stack.push_back(frame);
      frame = frame->f_back;
      depth++;
    }
    for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) {
      recordPyCall(ctx, *it);
    }

    // Note:
    //   This profile will not compose with other CPython profilers, and
    //   cannot be round tripped via `sys.settrace(sys.gettrace())`.
    PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx);
  }

  // Restore the thread state to its initial value.
  PyThreadState_Swap(thread_states[0]);

  active_ = true;
}

void PythonTracer::stop() {
  TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.");

  pybind11::gil_scoped_acquire gil;

  PyThreadState* initial_thread_state = PyThreadState_Get();
  for (const auto i : trace_contexts_) {
    PyThreadState_Swap(i->thread_state_);
    PyEval_SetProfile(nullptr, nullptr);
  }
  PyThreadState_Swap(initial_thread_state);
  active_ = false;
}

void PythonTracer::clear() {
  TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active.");
  for (auto i : trace_contexts_) {
    Py_DECREF((PyObject*)i);
  }
  trace_contexts_.clear();
  events_.clear();
  code_descriptions_.clear();
  c_function_reprs_.clear();
  for (auto& i : module_calls_) {
    Py_DECREF(i.self_);
  }
  module_calls_.clear();
}

void PythonTracer::recordPyCall(TraceContext* ctx, PyFrameObject* frame) {
  events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code);
  storeDescription(frame);
  trackModule(frame);
}

void PythonTracer::recordCCall(
    TraceContext* ctx,
    PyFrameObject* frame,
    PyObject* arg) {
  events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg);
  const auto& it = c_function_reprs_.find(arg);
  if (C10_UNLIKELY(it == c_function_reprs_.end())) {
    c_function_reprs_[arg] = py::repr(arg);
  }
}

void PythonTracer::recordReturn(
    TraceContext* ctx,
    PyFrameObject* frame,
    TraceTag tag) {
  events_.emplace_back(tag, frame->f_lasti, ctx);
}

// NB:
//   `frame->f_lasti` will advance as the interpreter progresses through the
//   code object. Thus, we need to call `storeDescription` when we record the
//   call rather than the return. (Otherwise we would get the line with the
//   return stmt.)
void PythonTracer::storeDescription(PyFrameObject* frame) {
  const auto& it = code_descriptions_.find({frame->f_code, frame->f_lasti});
  if (C10_UNLIKELY(it == code_descriptions_.end())) {
    code_descriptions_.insert(
        {{frame->f_code, frame->f_lasti},
         {/*line_no=*/PyCode_Addr2Line(frame->f_code, frame->f_lasti),
          /*filename=*/THPUtils_unpackString(frame->f_code->co_filename),
          /*funcname=*/THPUtils_unpackString(frame->f_code->co_name)}});
  }
}

void PythonTracer::trackModule(PyFrameObject* frame) {
  if ((PyObject*)(frame->f_code) == module_call_code_) {
    // By default, CPython stores locals in a "fast" format, with an array
    // of names and an array of values. Consequently, frame->f_locals is
    // NULL since the interpreter has no need to populate it.
    //
    // If these arrays were part of the public API then we could very
    // quickly access `self`. Unfortunately they are not, and moreover are
    // not stable across versions. As a result, we are forced to call
    // `PyFrame_FastToLocals` which forces the interpreter to materialize
    // the full dict of locals.
    PyFrame_FastToLocals(frame);
    auto self = PyDict_GetItemString(frame->f_locals, "self");
    Py_INCREF(self);
    module_calls_.emplace_back(
        /*event_index=*/events_.size() - 1,
        /*self=*/self);
    PyFrame_LocalsToFast(frame, 0);
  }
}

// ============================================================================
// == Post processing =========================================================
// ============================================================================

class PyTraceReplay {
 public:
  static std::vector<std::unique_ptr<PyTraceEvent>> getEvents() {
    return PyTraceReplay().replayStack();
  }

 private:
  PyTraceReplay();
  std::vector<std::unique_ptr<PyTraceEvent>> replayStack() const;

  struct ReplayFrame {
    std::unique_ptr<PyTraceEvent> event_;
    size_t id_;
    size_t parent_id_;
  };

  ska::flat_hash_map<size_t, PyObject*> module_self_map_;
  ska::flat_hash_map<size_t, std::string> module_name_map_;
};

PyTraceReplay::PyTraceReplay() {
  ska::flat_hash_map<PyObject*, std::string> module_names;
  for (const auto& call : PythonTracer::singleton().module_calls_) {
    if (module_names.find(call.self_) == module_names.end()) {
      std::stringstream name_stream;
      auto py_class_name =
          py::handle(call.self_).attr("__class__").attr("__name__");
      name_stream << "nn.Module: " << py::str(py_class_name);
      module_names.insert({call.self_, name_stream.str()});
    }

    module_self_map_.insert({call.event_index_, call.self_});
    module_name_map_.insert({call.event_index_, module_names.at(call.self_)});
  }
}

// TODO: Use re2.
void trimPrefix(std::string& s, const std::vector<std::string>& prefixes) {
  for (const auto& p : prefixes) {
    if (s.compare(0, p.size(), p) == 0) {
      s.erase(0, p.size());
      return;
    }
  }
}

std::vector<std::unique_ptr<PyTraceEvent>> PyTraceReplay::replayStack() const {
  const auto& tracer = PythonTracer::singleton();

  // We want to prune paths to a sensible prefix. For example
  //   `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py`
  // Pruning the path prefix is somewhat expensive, so we cache it.
  ska::flat_hash_map<std::string, std::string> filename_map;
  for (const auto& i : tracer.code_descriptions_) {
    if (filename_map.find(i.second.filename_) == filename_map.end()) {
      std::string s(i.second.filename_);
      trimPrefix(s, tracer.path_prefixes_);
      filename_map[i.second.filename_] = s;
    }
  }

  auto py_name = [&](const RawEvent& e) {
    const auto& desc_it =
        tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()});
    if (desc_it != tracer.code_descriptions_.end()) {
      std::stringstream name_stream;
      name_stream << filename_map.at(desc_it->second.filename_) << "("
                  << desc_it->second.line_no_
                  << "): " << desc_it->second.funcname_;
      return name_stream.str();
    }
    return std::string("Python: ???");
  };

  size_t id_counter = 0;
  std::vector<std::vector<ReplayFrame>> stacks(tracer.trace_contexts_.size());
  std::vector<ReplayFrame> results;

  // Match calls and returns.
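  // (For example, the recorded sequence
  //    kPy_Call(f), kPy_Call(g), kPy_Return, kPy_Return
  //  replays to an event for `g` nested inside an event for `f`: each call
  //  pushes a frame onto the per-thread stack, and each return pops the
  //  innermost open frame, stamping its end time and return index.)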
  size_t event_idx = 0;
  for (auto& raw_event : tracer.events_) {
    auto& stack = stacks[raw_event.thread_id_];
    auto ctx = tracer.trace_contexts_[raw_event.thread_id_];
    auto t = static_cast<int64_t>(raw_event.t_) + ctx->initial_us_;

    auto push_frame =
        [&](std::string name, CallType call_type, size_t module_id = 0) {
          stack.push_back(ReplayFrame{
              /*event_=*/std::make_unique<PyTraceEvent>(PyTraceEvent{
                  /*startTime_=*/t,
                  /*endTime_=*/-1,  // Placeholder
                  /*name_=*/name,
                  /*thread_id_=*/raw_event.thread_id_,
                  /*parent_=*/nullptr,  // Placeholder
                  /*call_type_=*/call_type,
                  /*module_id_=*/module_id,
                  /*call_idx_=*/event_idx,
                  /*return_idx_=*/0  // Placeholder
              }),
              /*id_=*/id_counter++,
              /*parent_id_=*/stack.size() ? stack.back().id_ : 0,
          });
        };

    switch (raw_event.tag()) {
      case TraceTag::kPy_Call:
        if (module_name_map_.find(event_idx) != module_name_map_.end()) {
          push_frame(
              module_name_map_.at(event_idx),
              CallType::kPyModuleCall,
              reinterpret_cast<size_t>(module_self_map_.at(event_idx)));
        } else {
          push_frame(py_name(raw_event), CallType::kPyCall);
        }
        break;

      case TraceTag::kC_Call:
        push_frame(
            tracer.c_function_reprs_.at(raw_event.misc_.arg_),
            CallType::kCCall);
        break;

      case TraceTag::kPy_Return:
      case TraceTag::kC_Return:
        TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.");
        stack.back().event_->endTime_ = t;
        stack.back().event_->return_idx_ = event_idx;
        results.push_back(std::move(stack.back()));
        stack.pop_back();
        break;
    }
    event_idx++;
  }

  // Clean up by feigning returns to close out the stack. This is needed so
  // that frames above the one that called the profiler still appear in the
  // trace.
  const auto t_final = now();
  for (auto& stack : stacks) {
    while (stack.size()) {
      stack.back().event_->endTime_ = t_final;
      stack.back().event_->return_idx_ = event_idx;
      results.push_back(std::move(stack.back()));
      stack.pop_back();
      event_idx++;
    }
  }

  // Convert to `PyTraceEvent`, and map id to pointer.
  ska::flat_hash_map<size_t, PyTraceEvent*> event_id_map{{0, nullptr}};
  std::vector<std::unique_ptr<PyTraceEvent>> out;
  for (auto& r : results) {
    out.push_back(std::move(r.event_));
    event_id_map.insert({r.id_, out.back().get()});
  }

  // Link parents to children.
  for (const auto i : c10::irange(results.size())) {
    out[i]->parent_ = event_id_map.at(results[i].parent_id_);
  }
  return out;
}

// ============================================================================
// == API =====================================================================
// ============================================================================
int PythonTracer::pyProfileFn(
    PyObject* obj,
    PyFrameObject* frame,
    int what,
    PyObject* arg) {
  auto ctx = reinterpret_cast<TraceContext*>(obj);
  switch (what) {
    case PyTrace_CALL:
      PythonTracer::singleton().recordPyCall(ctx, frame);
      break;

    case PyTrace_C_CALL:
      PythonTracer::singleton().recordCCall(ctx, frame, arg);
      break;

    case PyTrace_EXCEPTION:
    case PyTrace_RETURN:
      PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return);
      break;

    case PyTrace_C_EXCEPTION:
    case PyTrace_C_RETURN:
      PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return);
      break;
  }
  return 0;
}

void PythonTracer::call(Command c) {
  switch (c) {
    case Command::kStartOne:
      PythonTracer::singleton().start(1);
      break;

    case Command::kStartAll:
      PythonTracer::singleton().start();
      break;

    case Command::kStop:
      PythonTracer::singleton().stop();
      break;

    case Command::kClear:
      PythonTracer::singleton().clear();
      break;

    default:
      break;
  }
}

} // namespace

void init() {
  pybind11::gil_scoped_acquire gil;
  TORCH_CHECK(PyType_Ready(&TraceContextType) == 0);

  registerFunctions(
      /*call=*/&PythonTracer::call,
      /*get_events=*/&PyTraceReplay::getEvents);
}
}}}} // namespace torch::autograd::profiler::python_tracer