[Profiler/CPU] Add API for Dynamic Activity Toggling [3/n] (#133353)

Summary:
In this diff, we add the CPU-activity implementation for dynamically toggling profiling between steps. To do this, we remove the callbacks for Torch Ops on a disable call and add them back when an enable call is made.

This diff also adds some support code for doing the same in Python; however, the Python stack comes with its own set of complications when enabling this feature. For one, we get into a scenario where a Python frame entered during the toggle never gets an exit, because tracing is turned off before the frame returns, which makes for tricky post-processing. For this reason, we leave Python dynamic toggling off for now and will revisit it if there is enough demand.
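For illustration, here is a minimal sketch of how the new toggle is meant to be driven from the Python API (the tensors and step loop are placeholders, not part of this diff):

import torch
from torch.profiler import ProfilerActivity, profile

x, w = torch.randn(64, 64), torch.randn(64, 64)
with profile(activities=[ProfilerActivity.CPU]) as prof:
    for step in range(4):
        if step == 1:
            # Stop collecting Torch Op events for the middle steps.
            prof.toggle_collection_dynamic(False, [ProfilerActivity.CPU])
        if step == 3:
            # Resume collection for the final step.
            prof.toggle_collection_dynamic(True, [ProfilerActivity.CPU])
        (x @ w).relu().sum()  # stand-in for a real training step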

Test Plan: Generated the following trace by disabling Torch and CUDA ops: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/dynocli/devvm2185.cco0.facebook.com/rank-0.Aug_13_13_03_02.606577.pt.trace.json.gz&bucket=gpu_traces

Differential Revision: D61221497

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133353
Approved by: https://github.com/sanrise, https://github.com/aaronenyeshi
Author: Shivam Raikundalia
Date: 2024-08-16 16:36:57 +00:00
Committed by: PyTorch MergeBot
Parent: 46af996ce7
Commit: 9c2d119194
7 changed files with 92 additions and 8 deletions


@@ -384,6 +384,14 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
    eventPostProcessCb = std::move(cb);
  }
  void pausePython() {
    recordQueue.stop();
  }
  void resumePython() {
    recordQueue.restart();
  }
  std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>
  finalizeTrace() {
    auto end_time = getTimeNs();
@@ -610,19 +618,66 @@ void prepareProfiler(
  }
}
static void toggleTorchOpCollectionDynamic(bool enable) {
  auto state_ptr = ProfilerStateBase::get();
  if (state_ptr) {
    const auto& config = state_ptr->config();
    if (enable) {
      auto scopes = profiler_state_info_ptr->scopes;
      config.global() ? pushProfilingCallbacks</*global=*/true>(scopes)
                      : pushProfilingCallbacks</*global=*/false>(scopes);
    } else {
      state_ptr->removeCallback();
    }
  }
}
#ifdef _MSC_VER
#define UNUSED
#else
#define UNUSED __attribute__((unused))
#endif
static UNUSED void togglePythonCollectionDynamic(bool enable) {
  auto state_ptr = ProfilerStateBase::get();
  if (state_ptr) {
    auto global = state_ptr->config().global();
    KinetoThreadLocalState* kineto_thread_local_state_ptr =
        KinetoThreadLocalState::get(global);
    if (enable) {
      kineto_thread_local_state_ptr->resumePython();
    } else {
      kineto_thread_local_state_ptr->pausePython();
    }
  }
}
static void toggleCPUCollectionDynamic(bool enable) {
  toggleTorchOpCollectionDynamic(enable);
  // For now we only support dynamic toggling of Torch Op collection.
  // Supporting Python ops would require string parsing to strip out the
  // toggling events, handling of other unfinished events, and changes to the
  // stack logic.
  // togglePythonCollectionDynamic(enable);
}
void toggleCollectionDynamic(
    const bool enable,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  // TODO: CPU toggling should be done in this file to interface with
  // collection, similar to the enableProfiler call; GPU toggling is called in
  // impl::kineto as is.
  if (activities.count(torch::autograd::profiler::ActivityType::CPU) > 0 &&
      activities.count(torch::autograd::profiler::ActivityType::CUDA) == 0) {
    LOG(WARNING)
        << "Toggling CPU activity with CUDA activity on may result in traces with CUDA events on arbitrary tracks";
  }
  for (auto act : activities) {
    if (act == torch::autograd::profiler::ActivityType::CUDA) {
      torch::profiler::impl::kineto::toggleCollectionDynamic(enable);
    } else if (act == torch::autograd::profiler::ActivityType::CPU) {
      toggleCPUCollectionDynamic(enable);
    } else {
      LOG(WARNING)
          << "Dynamic toggle is only supported for CPU/GPU activity, skipping toggling of "
          << actToString(act);
      continue;
    }
  }
}


@@ -680,6 +680,7 @@ class PythonTracer final : public python_tracer::PythonTracerBase {
      PyObject* arg);
  void stop() override;
  void restart() override;
  std::vector<std::shared_ptr<Result>> getEvents(
      std::function<c10::time_t(c10::approx_time_t)> time_converter,
      std::vector<python_tracer::CompressedEvent>& enters,
@@ -811,6 +812,25 @@ void PythonTracer::stop() {
  }
}
void PythonTracer::restart() {
  gil_and_restore_thread gil;
  active_ = active_lock_.compare_exchange_strong(active_, true);
  if (!active_) {
    TORCH_WARN(
        "There is already an active Python tracer. "
        "Refusing to register profile functions.");
    return;
  }
  int cur_thread = 0;
  for (const auto thread_state : interpreterThreads()) {
    if (thread_state->c_profilefunc == nullptr) {
      auto* ctx = thread_local_results_[cur_thread].ctx_;
      PyThreadState_Swap(thread_state);
      PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx);
    }
    // Advance per interpreter thread so each thread re-registers with the
    // ctx_ it was assigned when tracing first started.
    cur_thread++;
  }
}
// NOLINTNEXTLINE(bugprone-exception-escape)
PythonTracer::~PythonTracer() {
if (active_) {


@@ -692,6 +692,12 @@ void RecordQueue::stop() {
  }
}
void RecordQueue::restart() {
  if (python_tracer_) {
    python_tracer_->restart();
  }
}
namespace {
void mark_finished(std::shared_ptr<Result>& r) {
  TORCH_INTERNAL_ASSERT(!r->finished_, r->name());


@@ -642,6 +642,7 @@ class TORCH_API RecordQueue {
  bool tracePython() const;
  ThreadLocalSubqueue* getSubqueue();
  void stop();
  void restart();
  // NB: This is a destructive operation.
  std::pair<


@@ -12,6 +12,7 @@ struct NoOpPythonTracer : public PythonTracerBase {
  ~NoOpPythonTracer() override = default;
  void stop() override {}
  void restart() override {}
  std::vector<std::shared_ptr<Result>> getEvents(
      std::function<c10::time_t(c10::approx_time_t)>,
      std::vector<CompressedEvent>&,


@@ -49,6 +49,7 @@ struct TORCH_API PythonTracerBase {
  virtual ~PythonTracerBase() = default;
  virtual void stop() = 0;
  virtual void restart() = 0;
  virtual std::vector<std::shared_ptr<Result>> getEvents(
      std::function<c10::time_t(c10::approx_time_t)> time_converter,
      std::vector<CompressedEvent>& enters,


@@ -242,12 +242,12 @@ class _KinetoProfile:
    def toggle_collection_dynamic(
        self, enable: bool, activities: Iterable[ProfilerActivity]
    ):
        """Toggle collection of activities on/off. Currently supports toggling
        Torch Ops (CPU) and all CUDA activity supported in Kineto.

        Args:
            activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
                ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``
        """
        if not self.profiler:
            return
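Since toggle_collection_dynamic returns early when the profiler has not been started, calls made before start() are no-ops, and unsupported activities are skipped with a warning on the C++ side rather than raising. A small sketch of that guard behavior (hypothetical usage, not part of this diff):

from torch.profiler import ProfilerActivity, profile

prof = profile(activities=[ProfilerActivity.CPU])
# Before start(), self.profiler is None, so this call returns without effect.
prof.toggle_collection_dynamic(False, [ProfilerActivity.CPU])

prof.start()
prof.toggle_collection_dynamic(False, [ProfilerActivity.CPU])  # pause Torch Op collection
prof.toggle_collection_dynamic(True, [ProfilerActivity.CPU])   # resume it
prof.stop()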