There are some unfortunate style issues, like four space indents and various other minor issues. There is a pretty big overhaul coming to the python tracer, so I want to be able to commit them with more style compliant code.

Differential Revision: [D36070201](https://our.internmc.facebook.com/intern/diff/D36070201/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/77692
Approved by: https://github.com/aaronenyeshi

#include <torch/csrc/autograd/profiler_python.h>

#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <Python.h>
#include <frameobject.h>

#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <torch/csrc/autograd/profiler_kineto.h>
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/utils/pybind.h>

namespace py = pybind11;


namespace torch { namespace autograd { namespace profiler { namespace python_tracer {
namespace {

// ============================================================================
// == Core data types =========================================================
// ============================================================================

// PyObject that allows different threads to record events without colliding.
// It is passed as the second argument when enabling tracing via
// `PyEval_SetProfile`.
struct TraceContext {
  PyObject_HEAD

  // It is wasteful to store an entire PyThreadState* in RawEvent. So
  // instead, we map thread ids down to a compact space that we can store in
  // a single byte.
  uint8_t thread_id_;
  PyThreadState* thread_state_;

  // Likewise, int64_t has more precision than we need. By tracking when the
  // profiler starts we can store "time since profile begin", which fits
  // into less space.
  int64_t initial_us_;

  // TODO:
  //   Wall time is actually fairly expensive to compute. Empirically, it
  //   takes ~600 ns to call `now()`. This puts a hard lower bound on the
  //   overhead of the tracer. If we collected wall time less frequently and
  //   used TSC (e.g. through __rdtsc) to interpolate, it should be possible
  //   to reduce time spent on timestamps while retaining the same level of
  //   accuracy.
};

// CPython boilerplate to define `TraceContext` as a proper python object.
static PyTypeObject TraceContextType = {
  PyVarObject_HEAD_INIT(nullptr, 0)
  "TraceContext",             /* tp_name */
  sizeof(TraceContext),       /* tp_basicsize */
  0,                          /* tp_itemsize */
  nullptr,                    /* tp_dealloc */
  0,                          /* tp_vectorcall_offset */  // NOLINT: modernize-use-nullptr
  nullptr,                    /* tp_getattr */
  nullptr,                    /* tp_setattr */
  nullptr,                    /* tp_reserved */
  nullptr,                    /* tp_repr */
  nullptr,                    /* tp_as_number */
  nullptr,                    /* tp_as_sequence */
  nullptr,                    /* tp_as_mapping */
  nullptr,                    /* tp_hash */
  nullptr,                    /* tp_call */
  nullptr,                    /* tp_str */
  nullptr,                    /* tp_getattro */
  nullptr,                    /* tp_setattro */
  nullptr,                    /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,         /* tp_flags */
  "Python tracer TLS",        /* tp_doc */
  nullptr,                    /* tp_traverse */
  nullptr,                    /* tp_clear */
  nullptr,                    /* tp_richcompare */
  0,                          /* tp_weaklistoffset */
  nullptr,                    /* tp_iter */
  nullptr,                    /* tp_iternext */
  nullptr,                    /* tp_methods */
  nullptr,                    /* tp_members */
  nullptr,                    /* tp_getset */
  nullptr,                    /* tp_base */
  nullptr,                    /* tp_dict */
  nullptr,                    /* tp_descr_get */
  nullptr,                    /* tp_descr_set */
  0,                          /* tp_dictoffset */
  nullptr,                    /* tp_init */
  nullptr,                    /* tp_alloc */
  PyType_GenericNew,          /* tp_new */
  nullptr                     /* tp_free */
};

// CPython has a more expressive set of events for tracing / profiling:
//   https://github.com/python/cpython/blob/f291404a802d6a1bc50f817c7a26ff3ac9a199ff/Include/cpython/pystate.h#L22-L29
// As an implementation detail they are defined as 0-7, however we don't want
// to rely on that while bit packing. Furthermore, the CPython descriptions
// are finer granularity than we're interested in. We do not need to
// differentiate between a normal return and an exception (both act as a pop
// in our replay stack), and we are not interested in `PyTrace_LINE` or
// `PyTrace_OPCODE`. To simplify things we store our own enum when tracefunc
// is called, and then use it for all subsequent processing.
enum TraceTag { kPy_Call = 0, kPy_Return, kC_Call, kC_Return };
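// For reference, `pyProfileFn` below collapses the CPython events as follows
// (a summary of its switch statement, not additional behavior):
//   PyTrace_CALL                          -> kPy_Call
//   PyTrace_RETURN, PyTrace_EXCEPTION     -> kPy_Return
//   PyTrace_C_CALL                        -> kC_Call
//   PyTrace_C_RETURN, PyTrace_C_EXCEPTION -> kC_Return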

// When we are tracing a Python program, the general procedure is to record
// every time we enter or exit a function and later replay these events during
// post processing. Thus, during the profiling phase we want to do the MINIMAL
// amount of work to capture all of the information that we need; otherwise we
// will distort the profile. (While we don't wish to be terribly inefficient
// during post processing, we are willing to do extra fixup work in post if it
// reduces overhead in the profiling phase.)
//
// To that end, `RawEvent` (which logs calls and returns) is bitpacked to
// reduce data stored and fit more events on a cache line. The following
// techniques are used:
//
//   1) Storing `tag_` as a uint8_t rather than a TraceTag.
//      The size of an enum, surprisingly, is not the amount of space needed
//      to store all the fields, but rather *at least* that size.
//      (`sizeof(TraceTag) == 2` on my system, for example.)
//
//   2) Storing thread id rather than the full PyThreadState*.
//
//   3) Storing f_lasti as a uint16_t rather than a full int.
//      In practice this is plenty. It is also less dangerous than it might
//      initially seem; when we call the CPython API (`PyCode_Addr2Line`) we
//      use the full int `f_lasti`. The truncation in the stored event only
//      affects the cache key when we replay the stack. While this could
//      result in cache misses (and unknown names) in corner cases, it has
//      the significant benefit of letting us skip the full line number
//      calculation after the first call to a function.
//
//   4) Storing time relative to the start of profiling.
//      In general profiling is short lived. Storing an entire int64_t just
//      to record that a handful of microseconds have passed is not a good
//      use of bits. So instead, we record the time since profiling began.
//      We can fit over an hour into a uint32_t, which is far longer than the
//      profiler should ever run for a continuous period.
//
// With these tricks we can pack all of the above into a single 8 byte word.
// The second word is case dependent.
//
// One obvious question is: why manually tag the union rather than using a
// `std::variant`? (Or `c10::variant`, as it were.) The answer is that due
// to alignment the tag would have to be packed with the union data and
// `RawEvent` would grow to three words. (Not just 50% bigger, but also less
// cache friendly.)
struct RawEvent {
  RawEvent(TraceTag tag, int lasti, TraceContext* ctx)
      : tag_(static_cast<uint8_t>(tag)),
        thread_id_(ctx->thread_id_),
        lasti_(static_cast<uint16_t>(lasti)),
        misc_() {
    int64_t t = now() - ctx->initial_us_;
    t_ = static_cast<uint32_t>(t);

    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        lasti <= std::numeric_limits<uint16_t>::max());
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits<uint32_t>::max());
  }

  RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code)
      : RawEvent(tag, lasti, ctx) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call);
    misc_.f_code_ = f_code;
  }

  RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg)
      : RawEvent(tag, lasti, ctx) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call);
    misc_.arg_ = arg;
  }

  uint8_t tag_;
  uint8_t thread_id_;
  uint16_t lasti_;
  uint32_t t_;
  union {
    // TraceTag::kPy_Call
    PyCodeObject* f_code_;

    // TraceTag::kC_Call
    PyObject* arg_;

    // TraceTag::kPy_Return
    // TraceTag::kC_Return
    // ** Unused (placeholder) **
    void* null_;
  } misc_;

  C10_NODISCARD TraceTag tag() const {
    return static_cast<TraceTag>(tag_);
  }

  C10_NODISCARD int lasti() const {
    // f_lasti is positive, with one exception: CPython initializes frames
    // with `f_lasti = -1`. We don't want to give up half of the range by
    // switching to int16_t. So instead we do the fast (underflowing) cast
    // in the ctor, and rectify the value in this accessor, which should
    // only be called during trace post processing.
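    // (For example, a freshly created frame's `f_lasti` of -1 is stored as
    // 0xFFFF by the constructor's cast and mapped back to -1 here.)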
    return lasti_ == std::numeric_limits<uint16_t>::max() ? (int)(-1)
                                                          : (int)lasti_;
  }
};

// Make sure the bit packing that we do in RawEvent actually results in the
// desired size reduction.
static_assert(sizeof(RawEvent) <= 16, "RawEvent is too large");
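
// Rough layout on a typical 64-bit build (illustrative only; just the total
// size is checked above, not individual fields):
//   word 1: tag_ (1 byte) | thread_id_ (1 byte) | lasti_ (2 bytes) | t_ (4 bytes)
//   word 2: misc_ union (one 8 byte pointer)
// A uint32_t of microseconds covers 2^32 us ~= 4295 s ~= 71.6 minutes, which
// is where the "over an hour" figure above comes from.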

// std::hash doesn't have a specialization for pairs so we have to define one.
// A simple XOR is good enough for our purposes.
struct hash_pair {
  template <class T1, class T2>
  size_t operator()(const std::pair<T1, T2>& pair) const {
    return std::hash<T1>()(pair.first) ^ std::hash<T2>()(pair.second);
  }
};
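
// (Its only use in this file is as the hasher for the
// `ska::flat_hash_map<DescriptionKey, CodeDescription, hash_pair>` cache
// declared in `PythonTracer` below.)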

// ============================================================================
// == Tracing implementation ==================================================
// ============================================================================
constexpr size_t max_py_threads = std::numeric_limits<uint8_t>::max() + 1;
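// (That is 256, matching the one byte `thread_id_` space in `RawEvent`.)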

class PythonTracer final {
 public:
  // Static methods serve as external interfaces (which expect raw pointers)
  // and handle forwarding to the singleton.
  static void call(Command c);

  static int pyProfileFn(
      PyObject* obj,
      PyFrameObject* frame,
      int what,
      PyObject* arg);

 private:
  PythonTracer();
  static PythonTracer& singleton();
  friend class PyTraceReplay;

  void start(size_t max_threads = max_py_threads);
  void stop();
  void clear();

  void recordPyCall(TraceContext* ctx, PyFrameObject* frame);
  void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg);
  void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag);

  void storeDescription(PyFrameObject* frame);
  void trackModule(PyFrameObject* frame);

  // It is imperative that we do not store strings for each python function,
  // as that would do terrible things to our profiling overhead. So instead
  // we store the much cheaper pair of `PyCodeObject*` and `int`, which we can
  // pack into `RawEvent`, and then store a mapping to the full strings the
  // first time we see a function.
  //
  // TODO:
  //   In theory we should be able to use a combination of Py_INCREF on
  //   `f_code` and string interning to skip this step. (Effectively reusing
  //   work that the CPython interpreter has already done.) However it tends
  //   to segfault, and simply caching the strings is inexpensive.
  struct CodeDescription {
    CodeDescription(int line_no, std::string filename, std::string funcname)
        : line_no_(line_no),
          filename_(std::move(filename)),
          funcname_(std::move(funcname)) {}
    int line_no_;
    std::string filename_;
    std::string funcname_;
  };

  struct ModuleForward {
    ModuleForward(size_t event_index, PyObject* self)
        : event_index_(event_index), self_(self) {}
    size_t event_index_;

    // NB:
    //   This is a non-owning reference to keep `ModuleForward` POD;
    //   `PythonTracer` owns the contents instead. We Py_INCREF in
    //   `trackModule`, and `clear` is responsible for calling Py_DECREF
    //   when clearing `module_calls_`.
    PyObject* self_;
  };

  bool active_;
  PyObject* module_call_code_;
  std::vector<std::string> path_prefixes_;
  std::vector<TraceContext*> trace_contexts_;

  std::vector<RawEvent> events_;
  std::vector<ModuleForward> module_calls_;

  using DescriptionKey = std::pair</*f_code=*/PyCodeObject*, /*f_lasti=*/int>;
  ska::flat_hash_map<DescriptionKey, CodeDescription, hash_pair>
      code_descriptions_;
  ska::flat_hash_map<PyObject*, std::string> c_function_reprs_;
};

PythonTracer& PythonTracer::singleton() {
  static PythonTracer singleton_;
  return singleton_;
}

PythonTracer::PythonTracer() : active_(false) {
  path_prefixes_ = py::module::import("torch.profiler.python_tracer")
      .attr("_prefix_regex")().cast<std::vector<std::string>>();

  module_call_code_ = py::module::import("torch.nn")
      .attr("Module")
      .attr("__call__")
      .attr("__code__")
      .ptr();
}
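
// Note: `module_call_code_` is the code object shared by every
// `torch.nn.Module.__call__` frame, so `trackModule` below can detect module
// forwards with a single pointer comparison; post processing then names such
// frames "nn.Module: <ClassName>" (see `PyTraceReplay`).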

void PythonTracer::start(size_t max_threads) {
  TORCH_CHECK(!active_, "PythonTracer is already active");
  TORCH_CHECK(
      !trace_contexts_.size(), "PythonTracer should not have active contexts");
  TORCH_CHECK(
      max_threads > 0, "max_threads must be positive, got ", max_threads);
  TORCH_CHECK(
      max_threads <= max_py_threads,
      "max_threads must be less than or equal to ",
      max_py_threads);

  pybind11::gil_scoped_acquire gil;
  auto t0 = now();

  // Loop over all threads within the current interpreter. We will need to
  // register a trace function with each thread. We set the current thread to
  // position zero to ensure that it is traced, and so we can restore the
  // thread state after registration.
  std::vector<PyThreadState*> thread_states{PyThreadState_Get()};
  if (max_threads > 1) {
    auto thread_state = thread_states[0];
    while (thread_state != nullptr) {
      if (thread_state != thread_states[0]) {
        thread_states.push_back(thread_state);
      }
      thread_state = PyThreadState_Next(thread_state);
    }

    if (thread_states.size() > max_threads) {
      std::cout << "Warning: can only trace " << max_threads << " threads. "
                << thread_states.size() << " are currently active."
                << std::endl;
      thread_states.resize(max_threads);
    }
  }

  // Register the tracer in each thread.
  for (const auto i : c10::irange(thread_states.size())) {
    PyThreadState* thread_state = thread_states[i];
    PyThreadState_Swap(thread_state);

    auto ctx = (TraceContext*)TraceContextType.tp_alloc(&TraceContextType, 0);
    ctx->thread_id_ = (uint8_t)i;
    ctx->thread_state_ = thread_state;
    ctx->initial_us_ = t0;
    trace_contexts_.push_back(ctx);

    // When we begin profiling there are already frames on the Python
    // interpreter stack. To ensure a complete trace, we must push calls
    // to all of the prior frames onto our event stack. (We stop at depth=128.)
    std::vector<PyFrameObject*> current_stack;
    auto frame = PyEval_GetFrame();
    size_t depth = 0;  // Make sure we can't infinite loop.
    while (frame != nullptr && depth <= 128) {
      current_stack.push_back(frame);
      frame = frame->f_back;
      depth++;
    }
    for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) {
      recordPyCall(ctx, *it);
    }

    // Note:
    //   This profile will not compose with other CPython profilers, and
    //   cannot be round tripped via `sys.settrace(sys.gettrace())`.
    PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx);
  }

  // Restore the thread state to its initial value.
  PyThreadState_Swap(thread_states[0]);

  active_ = true;
}

void PythonTracer::stop() {
  TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.");

  pybind11::gil_scoped_acquire gil;

  PyThreadState* initial_thread_state = PyThreadState_Get();
  for (const auto i : trace_contexts_) {
    PyThreadState_Swap(i->thread_state_);
    PyEval_SetProfile(nullptr, nullptr);
  }
  PyThreadState_Swap(initial_thread_state);
  active_ = false;
}

void PythonTracer::clear() {
  TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active.");
  for (auto i : trace_contexts_) {
    Py_DECREF((PyObject*)i);
  }
  trace_contexts_.clear();
  events_.clear();
  code_descriptions_.clear();
  c_function_reprs_.clear();
  for (auto& i : module_calls_) {
    Py_DECREF(i.self_);
  }
  module_calls_.clear();
}

void PythonTracer::recordPyCall(TraceContext* ctx, PyFrameObject* frame) {
  events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code);
  storeDescription(frame);
  trackModule(frame);
}

void PythonTracer::recordCCall(
    TraceContext* ctx,
    PyFrameObject* frame,
    PyObject* arg) {
  events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg);
  const auto& it = c_function_reprs_.find(arg);
  if (C10_UNLIKELY(it == c_function_reprs_.end())) {
    c_function_reprs_[arg] = py::repr(arg);
  }
}

void PythonTracer::recordReturn(
    TraceContext* ctx,
    PyFrameObject* frame,
    TraceTag tag) {
  events_.emplace_back(tag, frame->f_lasti, ctx);
}

// NB:
//   `frame->f_lasti` will advance as the interpreter progresses through the
//   code object. Thus, we need to call `storeDescription` when we record the
//   call rather than the return. (Otherwise we would get the line with the
//   return stmt.)
void PythonTracer::storeDescription(PyFrameObject* frame) {
  const auto& it = code_descriptions_.find({frame->f_code, frame->f_lasti});
  if (C10_UNLIKELY(it == code_descriptions_.end())) {
    code_descriptions_.insert(
        {{frame->f_code, frame->f_lasti},
         {/*line_no=*/PyCode_Addr2Line(frame->f_code, frame->f_lasti),
          /*filename=*/THPUtils_unpackString(frame->f_code->co_filename),
          /*funcname=*/THPUtils_unpackString(frame->f_code->co_name)}});
  }
}

void PythonTracer::trackModule(PyFrameObject* frame) {
  if ((PyObject*)(frame->f_code) == module_call_code_) {
    // By default, CPython stores locals in a "fast" format, with an array
    // of names and an array of values. Consequently, frame->f_locals is
    // NULL since the interpreter has no need to populate it.
    //
    // If these arrays were part of the public API then we could very
    // quickly access `self`. Unfortunately they are not, and moreover are
    // not stable across versions. As a result, we are forced to call
    // `PyFrame_FastToLocals` which forces the interpreter to materialize
    // the full dict of locals.
    PyFrame_FastToLocals(frame);
    auto self = PyDict_GetItemString(frame->f_locals, "self");
    Py_INCREF(self);
    module_calls_.emplace_back(
        /*event_index=*/events_.size() - 1,
        /*self=*/self);
    PyFrame_LocalsToFast(frame, 0);
  }
}

// ============================================================================
// == Post processing =========================================================
// ============================================================================

class PyTraceReplay {
 public:
  static std::vector<std::unique_ptr<PyTraceEvent>> getEvents() {
    return PyTraceReplay().replayStack();
  }

 private:
  PyTraceReplay();
  std::vector<std::unique_ptr<PyTraceEvent>> replayStack() const;

  struct ReplayFrame {
    std::unique_ptr<PyTraceEvent> event_;
    size_t id_;
    size_t parent_id_;
  };

  ska::flat_hash_map<size_t, PyObject*> module_self_map_;
  ska::flat_hash_map<size_t, std::string> module_name_map_;
};

PyTraceReplay::PyTraceReplay() {
  ska::flat_hash_map<PyObject*, std::string> module_names;
  for (const auto& call : PythonTracer::singleton().module_calls_) {
    if (module_names.find(call.self_) == module_names.end()) {
      std::stringstream name_stream;
      auto py_class_name =
          py::handle(call.self_).attr("__class__").attr("__name__");
      name_stream << "nn.Module: " << py::str(py_class_name);
      module_names.insert({call.self_, name_stream.str()});
    }

    module_self_map_.insert({call.event_index_, call.self_});
    module_name_map_.insert({call.event_index_, module_names.at(call.self_)});
  }
}

// TODO: Use re2.
void trimPrefix(std::string& s, const std::vector<std::string>& prefixes) {
  for (const auto& p : prefixes) {
    if (s.compare(0, p.size(), p) == 0) {
      s.erase(0, p.size());
      return;
    }
  }
}

std::vector<std::unique_ptr<PyTraceEvent>> PyTraceReplay::replayStack() const {
  const auto& tracer = PythonTracer::singleton();

  // We want to prune paths to a sensible prefix. For example
  //   `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py`
  // Pruning the path prefix is somewhat expensive, so we cache it.
  ska::flat_hash_map<std::string, std::string> filename_map;
  for (const auto& i : tracer.code_descriptions_) {
    if (filename_map.find(i.second.filename_) == filename_map.end()) {
      std::string s(i.second.filename_);
      trimPrefix(s, tracer.path_prefixes_);
      filename_map[i.second.filename_] = s;
    }
  }

  auto py_name = [&](const RawEvent& e) {
    const auto& desc_it =
        tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()});
    if (desc_it != tracer.code_descriptions_.end()) {
      std::stringstream name_stream;
      name_stream << filename_map.at(desc_it->second.filename_) << "("
                  << desc_it->second.line_no_
                  << "): " << desc_it->second.funcname_;
      return name_stream.str();
    }
    return std::string("Python: ???");
  };

  size_t id_counter = 0;
  std::vector<std::vector<ReplayFrame>> stacks(tracer.trace_contexts_.size());
  std::vector<ReplayFrame> results;

  // Match calls and returns.
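  // (For example, the recorded sequence
  //    kPy_Call(f), kPy_Call(g), kPy_Return, kPy_Return
  //  replays to an event for `g` nested inside an event for `f`: each call
  //  pushes a frame onto the per-thread stack, and each return pops the
  //  innermost open frame, stamping its end time and return index.)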
  size_t event_idx = 0;
  for (auto& raw_event : tracer.events_) {
    auto& stack = stacks[raw_event.thread_id_];
    auto ctx = tracer.trace_contexts_[raw_event.thread_id_];
    auto t = static_cast<int64_t>(raw_event.t_) + ctx->initial_us_;

    auto push_frame =
        [&](std::string name, CallType call_type, size_t module_id = 0) {
          stack.push_back(ReplayFrame{
              /*event_=*/std::make_unique<PyTraceEvent>(PyTraceEvent{
                  /*startTime_=*/t,
                  /*endTime_=*/-1,  // Placeholder
                  /*name_=*/name,
                  /*thread_id_=*/raw_event.thread_id_,
                  /*parent_=*/nullptr,  // Placeholder
                  /*call_type_=*/call_type,
                  /*module_id_=*/module_id,
                  /*call_idx_=*/event_idx,
                  /*return_idx_=*/0  // Placeholder
              }),
              /*id_=*/id_counter++,
              /*parent_id_=*/stack.size() ? stack.back().id_ : 0,
          });
        };

    switch (raw_event.tag()) {
      case TraceTag::kPy_Call:
        if (module_name_map_.find(event_idx) != module_name_map_.end()) {
          push_frame(
              module_name_map_.at(event_idx),
              CallType::kPyModuleCall,
              reinterpret_cast<size_t>(module_self_map_.at(event_idx)));
        } else {
          push_frame(py_name(raw_event), CallType::kPyCall);
        }
        break;

      case TraceTag::kC_Call:
        push_frame(
            tracer.c_function_reprs_.at(raw_event.misc_.arg_),
            CallType::kCCall);
        break;

      case TraceTag::kPy_Return:
      case TraceTag::kC_Return:
        TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.");
        stack.back().event_->endTime_ = t;
        stack.back().event_->return_idx_ = event_idx;
        results.push_back(std::move(stack.back()));
        stack.pop_back();
        break;
    }
    event_idx++;
  }

  // Clean up by feigning returns to close out the stack. This is needed so
  // that frames above the one that called the profiler still appear in the
  // trace.
  const auto t_final = now();
  for (auto& stack : stacks) {
    while (stack.size()) {
      stack.back().event_->endTime_ = t_final;
      stack.back().event_->return_idx_ = event_idx;
      results.push_back(std::move(stack.back()));
      stack.pop_back();
      event_idx++;
    }
  }

  // Convert to `PyTraceEvent`, and map id to pointer.
  ska::flat_hash_map<size_t, PyTraceEvent*> event_id_map{{0, nullptr}};
  std::vector<std::unique_ptr<PyTraceEvent>> out;
  for (auto& r : results) {
    out.push_back(std::move(r.event_));
    event_id_map.insert({r.id_, out.back().get()});
  }

  // Link parents to children.
  for (const auto i : c10::irange(results.size())) {
    out[i]->parent_ = event_id_map.at(results[i].parent_id_);
  }
  return out;
}

// ============================================================================
// == API =====================================================================
// ============================================================================
int PythonTracer::pyProfileFn(
    PyObject* obj,
    PyFrameObject* frame,
    int what,
    PyObject* arg) {
  auto ctx = reinterpret_cast<TraceContext*>(obj);
  switch (what) {
    case PyTrace_CALL:
      PythonTracer::singleton().recordPyCall(ctx, frame);
      break;

    case PyTrace_C_CALL:
      PythonTracer::singleton().recordCCall(ctx, frame, arg);
      break;

    case PyTrace_EXCEPTION:
    case PyTrace_RETURN:
      PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return);
      break;

    case PyTrace_C_EXCEPTION:
    case PyTrace_C_RETURN:
      PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return);
      break;
  }
  return 0;
}

void PythonTracer::call(Command c) {
  switch (c) {
    case Command::kStartOne:
      PythonTracer::singleton().start(1);
      break;

    case Command::kStartAll:
      PythonTracer::singleton().start();
      break;

    case Command::kStop:
      PythonTracer::singleton().stop();
      break;

    case Command::kClear:
      PythonTracer::singleton().clear();
      break;

    default:
      break;
  }
}

} // namespace

void init() {
  pybind11::gil_scoped_acquire gil;
  TORCH_CHECK(PyType_Ready(&TraceContextType) == 0);

  registerFunctions(
      /*call=*/&PythonTracer::call,
      /*get_events=*/&PyTraceReplay::getEvents);
}
}}}} // namespace torch::autograd::profiler::python_tracer