From a762dd1f6739ca32d596f1f0a98f71cf00db2113 Mon Sep 17 00:00:00 2001
From: Zizeng Meng <zzmeng@meta.com>
Date: Thu, 15 May 2025 06:07:04 +0000
Subject: [PATCH] [Memento] On-demand mode using without torch api (#153171)

Summary:
CUDA Post: https://fb.workplace.com/groups/ai.efficiency.tools.users/permalink/2020094788475989/

# Context
In this diff, we enable the on-demand mode of memory snapshots so that users can trace any remote process via the dyno command line.

# Design decision

**How do we send on-demand signal to remote process**
We leverage the dyno-Kineto approach.
Since dyno runs on every machine at Meta, it can send a request to the remote machine to start Kineto.
Kineto will then start another thread for the memoryProfiler (https://fburl.com/code/dxsmmrok)

**Why we use a different approach from CUDA**

On the CUDA side, we use pybind to load the torch module and invoke the Python API to start/stop the profiling. However, this requires us to compile the whole torch binary into the predictor, which is not recommended by runtime (andruwang).

Thus, we decided to use the C++ API directly to avoid unnecessary dependencies.

**Why the snapshot is saved directly as a JSON string instead of pickle**
Pickle is primarily designed for use with Python and is not well supported in C++. Also, it is hard for users to download the snapshot file and open it locally.
Due to the dependency issue, it is hard to import the gzip/pickle libraries to decode the data. Thus, let's use JSON for now. I will work on the visualizer to speed up rendering and support other formats later.

**Plan**:
* For now, we will encode the file as gz for MTIA on-demand only and update the visualizer to support both types.
* Update auto-trace and CUDA side to encode in gzip as well
* Fully remove pickle dependency.

Test Plan:
# Remote cogwheel test
Servicelab: https://fburl.com/servicelab/pckux7a3
snapshot file manifold: https://fburl.com/manifold/fnotk18c
snapshot file in pastry: P1805522232

Visualization on D74399684
 {F1977786422}

# Local Predictor Test
url: https://fburl.com/pytorch_memory_visualizer/y06kskkm

 {F1977787329}

Differential Revision: D74179606

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153171
Approved by: https://github.com/sraikund16
---
 aten/src/ATen/detail/MTIAHooksInterface.h     |  2 +-
 buckbuild.bzl                                 |  2 +
 build_variables.bzl                           |  1 +
 torch/csrc/autograd/profiler_python.cpp       |  4 +-
 torch/csrc/mtia/Module.cpp                    |  3 +-
 .../csrc/mtia/profiler/MTIAMemoryProfiler.cpp | 35 +++++++++++++++
 torch/csrc/mtia/profiler/MTIAMemoryProfiler.h | 20 +++++++++
 .../csrc/profiler/kineto_client_interface.cpp |  5 ++-
 .../profiler/orchestration/python_tracer.cpp  |  2 +-
 .../profiler/orchestration/python_tracer.h    |  2 +-
 .../profiler/python/combined_traceback.cpp    | 43 +++++++++++++++++++
 .../csrc/profiler/python/combined_traceback.h |  5 +++
 12 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp
 create mode 100644 torch/csrc/mtia/profiler/MTIAMemoryProfiler.h

diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h
index 16981789f684..642941cb743f 100644
--- a/aten/src/ATen/detail/MTIAHooksInterface.h
+++ b/aten/src/ATen/detail/MTIAHooksInterface.h
@@ -126,7 +126,7 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
     FAIL_MTIAHOOKS_FUNC(__func__);
   }
 
-  virtual PyObject* memorySnapshot() const {
+  virtual PyObject* memorySnapshot(const std::optional<std::string>& local_path) const {
     FAIL_MTIAHOOKS_FUNC(__func__);
     return nullptr;
   }
diff --git a/buckbuild.bzl b/buckbuild.bzl
index 961d6185853c..747d9afec4b2 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -181,6 +181,7 @@ THIRD_PARTY_LIBS = {
     "pyyaml": ["//third-party/pypi/pyyaml:pyyaml", "//third_party:pyyaml"],
     "rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"],
     "ruy": ["//third-party/ruy:ruy_xplat_lib", "//third_party:ruy_lib"],
+    "nlohmann-json": ["fbsource//third-party/nlohmann-json:nlohmann-json", "//third_party:nlohmann-json"],
     "sleef_arm": ["//third-party/sleef:sleef_arm", "//third_party:sleef_arm"],
 }
 
@@ -1735,6 +1736,7 @@ def define_buck_targets(
         deps = [
             third_party("glog"),
             third_party("kineto"),
+            third_party("nlohmann-json"),
         ],
         exported_deps = [
             ":aten_cpu",
diff --git a/build_variables.bzl b/build_variables.bzl
index 7cac3da12100..acf363d37dbb 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -101,6 +101,7 @@ libtorch_profiler_sources = [
     "torch/csrc/profiler/collection.cpp",
     "torch/csrc/profiler/data_flow.cpp",
     "torch/csrc/profiler/kineto_shim.cpp",
+    "torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp",
     "torch/csrc/profiler/kineto_client_interface.cpp",
     "torch/csrc/profiler/orchestration/observer.cpp",
     "torch/csrc/profiler/orchestration/python_tracer.cpp",
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index 4fdd87c44829..e74b7cd4a7e4 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -1162,7 +1162,7 @@ class PythonMemoryTracer final : public python_tracer::PythonMemoryTracerBase {
   ~PythonMemoryTracer() override = default;
   void start() override;
   void stop() override;
-  void export_memory_history(const std::string path) override;
+  void export_memory_history(const std::string& path) override;
 };
 
 static void toggle_memory_tracing(bool enable) {
@@ -1196,7 +1196,7 @@ void PythonMemoryTracer::start() {
   toggle_memory_tracing(true);
 }
 
-void PythonMemoryTracer::export_memory_history(const std::string path) {
+void PythonMemoryTracer::export_memory_history(const std::string& path) {
   pybind11::gil_scoped_acquire gil;
   THPObjectPtr torch_cuda_memory_module(
       PyImport_ImportModule("torch.cuda.memory"));
diff --git a/torch/csrc/mtia/Module.cpp b/torch/csrc/mtia/Module.cpp
index 1ea6c6396f17..ee71866e9bd9 100644
--- a/torch/csrc/mtia/Module.cpp
+++ b/torch/csrc/mtia/Module.cpp
@@ -96,7 +96,8 @@ void initModule(PyObject* module) {
       });
 
   m.def("_mtia_memorySnapshot", []() {
-    PyObject* raw_pyobject = at::detail::getMTIAHooks().memorySnapshot();
+    PyObject* raw_pyobject =
+        at::detail::getMTIAHooks().memorySnapshot(std::nullopt);
     return py::reinterpret_steal<py::object>(raw_pyobject);
   });
 
diff --git a/torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp b/torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp
new file mode 100644
index 000000000000..4ecc4c9bcf60
--- /dev/null
+++ b/torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp
@@ -0,0 +1,35 @@
+#include <ATen/Context.h>
+#include <ATen/detail/MTIAHooksInterface.h>
+#include <nlohmann/json.hpp>
+#include <torch/csrc/mtia/profiler/MTIAMemoryProfiler.h>
+
+using json = nlohmann::json;
+
+namespace torch::mtia {
+
+void MTIAMemoryProfiler::start() { // NOTE(review): 150000 is presumably a max-entries cap — confirm and name it
+  at::detail::getMTIAHooks().recordMemoryHistory("all", "all", 150000);
+}
+
+void MTIAMemoryProfiler::export_memory_history(const std::string& path) {
+  // NOTE(review): the PyObject* from memorySnapshot is dropped; confirm null.
+  at::detail::getMTIAHooks().memorySnapshot(path);
+}
+
+void MTIAMemoryProfiler::stop() { // first argument is std::nullopt (start() passes "all"); presumably disables recording — confirm
+  at::detail::getMTIAHooks().recordMemoryHistory(std::nullopt, "all", 0);
+}
+
+std::unique_ptr<torch::profiler::impl::python_tracer::PythonMemoryTracerBase>
+getMemoryTracer() { // factory that initMemoryProfiler() hands to registerMemoryTracer
+  return std::make_unique<MTIAMemoryProfiler>();
+}
+
+void initMemoryProfiler() { // does nothing unless isMTIAHooksBuilt() is true
+  if (at::detail::isMTIAHooksBuilt()) {
+    fprintf(stderr, "Initializing MTIA Memory Tracer\n"); // written to stderr on every registration
+    torch::profiler::impl::python_tracer::registerMemoryTracer(
+        &getMemoryTracer);
+  }
+}
+} // namespace torch::mtia
diff --git a/torch/csrc/mtia/profiler/MTIAMemoryProfiler.h b/torch/csrc/mtia/profiler/MTIAMemoryProfiler.h
new file mode 100644
index 000000000000..8ce22f2af780
--- /dev/null
+++ b/torch/csrc/mtia/profiler/MTIAMemoryProfiler.h
@@ -0,0 +1,20 @@
+#pragma once
+#include <torch/csrc/profiler/orchestration/python_tracer.h>
+
+namespace torch::mtia {
+using namespace torch::profiler::impl::python_tracer;
+
+void initMemoryProfiler();
+
+std::unique_ptr<PythonMemoryTracerBase> getMemoryTracer();
+
+class MTIAMemoryProfiler final : public PythonMemoryTracerBase {
+ public:
+  explicit MTIAMemoryProfiler() = default;
+  ~MTIAMemoryProfiler() override = default;
+  void start() override; // calls MTIA hooks recordMemoryHistory("all", "all", 150000)
+  void stop() override; // calls MTIA hooks recordMemoryHistory(std::nullopt, "all", 0)
+  void export_memory_history(const std::string& path) override; // forwards path to MTIA hooks memorySnapshot
+};
+
+} // namespace torch::mtia
diff --git a/torch/csrc/profiler/kineto_client_interface.cpp b/torch/csrc/profiler/kineto_client_interface.cpp
index 89c824cd578f..bb805ad3d72c 100644
--- a/torch/csrc/profiler/kineto_client_interface.cpp
+++ b/torch/csrc/profiler/kineto_client_interface.cpp
@@ -2,6 +2,7 @@
 #include <ATen/Context.h>
 #include <libkineto.h>
 #include <torch/csrc/autograd/profiler_kineto.h>
+#include <torch/csrc/mtia/profiler/MTIAMemoryProfiler.h>
 #include <torch/csrc/profiler/kineto_client_interface.h>
 #include <chrono>
 #include <thread>
@@ -23,7 +24,9 @@ using namespace torch::autograd::profiler;
 
 class LibKinetoClient : public libkineto::ClientInterface {
  public:
-  void init() override {}
+  void init() override {
+    ::torch::mtia::initMemoryProfiler();
+  }
 
   void prepare(
       bool report_input_shapes = false,
diff --git a/torch/csrc/profiler/orchestration/python_tracer.cpp b/torch/csrc/profiler/orchestration/python_tracer.cpp
index 73bdf3ccb017..d5d120d376f2 100644
--- a/torch/csrc/profiler/orchestration/python_tracer.cpp
+++ b/torch/csrc/profiler/orchestration/python_tracer.cpp
@@ -24,7 +24,7 @@ struct NoOpMemoryPythonTracer : public PythonMemoryTracerBase {
   ~NoOpMemoryPythonTracer() override = default;
   void start() override {}
   void stop() override {}
-  void export_memory_history(const std::string path) override {}
+  void export_memory_history(const std::string&) override {}
 };
 
 } // namespace
diff --git a/torch/csrc/profiler/orchestration/python_tracer.h b/torch/csrc/profiler/orchestration/python_tracer.h
index 725c6d8a5c95..52387e92e562 100644
--- a/torch/csrc/profiler/orchestration/python_tracer.h
+++ b/torch/csrc/profiler/orchestration/python_tracer.h
@@ -66,7 +66,7 @@ struct TORCH_API PythonMemoryTracerBase {
 
   virtual void start() = 0;
   virtual void stop() = 0;
-  virtual void export_memory_history(const std::string path) = 0;
+  virtual void export_memory_history(const std::string& path) = 0;
 };
 
 using MakeMemoryFn = std::unique_ptr<PythonMemoryTracerBase> (*)();
diff --git a/torch/csrc/profiler/python/combined_traceback.cpp b/torch/csrc/profiler/python/combined_traceback.cpp
index f9e20541ed86..fc1269ed3498 100644
--- a/torch/csrc/profiler/python/combined_traceback.cpp
+++ b/torch/csrc/profiler/python/combined_traceback.cpp
@@ -115,6 +115,49 @@ struct PythonTraceback : public CapturedTraceback::Python {
 
 } // namespace
 
+// JSON counterpart of py_symbolize: symbolizes `to_symbolize` and returns one
+// JSON array per input entry (same order), whose elements are objects with
+// "name", "filename" and "line" keys. Repeated pointers are symbolized once
+// and share the same frame list.
+std::vector<nlohmann::json> json_symbolize(
+    std::vector<CapturedTraceback*>& to_symbolize) {
+  // Map each distinct pointer to its slot in unique_frames.
+  std::unordered_map<CapturedTraceback*, uint64_t> cached_frames;
+  std::vector<CapturedTraceback*> unique_frames;
+  for (const auto& sc : to_symbolize) {
+    if (cached_frames.try_emplace(sc, unique_frames.size()).second) {
+      unique_frames.push_back(sc);
+    }
+  }
+  auto s = symbolize(unique_frames);
+
+  std::vector<nlohmann::json> all_frames;
+  for (const auto& f : s.all_frames) {
+    nlohmann::json d;
+    d["name"] = f.funcname;
+    d["filename"] = f.filename;
+    d["line"] = f.lineno;
+    all_frames.emplace_back(std::move(d));
+  }
+
+  std::vector<nlohmann::json> py_unique_frames;
+  for (const auto& t : s.tracebacks) {
+    // Start from an explicit array so an empty traceback is `[]`, not `null`.
+    nlohmann::json l = nlohmann::json::array();
+    for (const auto& e : t) {
+      l.emplace_back(all_frames.at(e));
+    }
+    py_unique_frames.push_back(std::move(l));
+  }
+
+  std::vector<nlohmann::json> result;
+  result.reserve(to_symbolize.size());
+  for (const auto& sc : to_symbolize) {
+    result.push_back(py_unique_frames.at(cached_frames.at(sc)));
+  }
+  return result;
+}
+
 std::vector<py::object> py_symbolize(
     std::vector<CapturedTraceback*>& to_symbolize) {
   // we dedup repeated to_symbolize objects to prevent
diff --git a/torch/csrc/profiler/python/combined_traceback.h b/torch/csrc/profiler/python/combined_traceback.h
index 03b3846822de..7e1f76b5c0c8 100644
--- a/torch/csrc/profiler/python/combined_traceback.h
+++ b/torch/csrc/profiler/python/combined_traceback.h
@@ -1,5 +1,6 @@
 #include <torch/csrc/profiler/combined_traceback.h>
 
+#include <nlohmann/json.hpp>
 #include <pybind11/pybind11.h>
 #include <torch/csrc/utils/pybind.h>
 
@@ -14,6 +15,10 @@ namespace torch {
 TORCH_API std::vector<pybind11::object> py_symbolize(
     std::vector<CapturedTraceback*>& to_symbolize);
 
+// Return the symbolized tracebacks as JSON so that they can be used in C++
+TORCH_API std::vector<nlohmann::json> json_symbolize(
+    std::vector<CapturedTraceback*>& to_symbolize);
+
 // requires GIL to be held, frees any pending free frames
 TORCH_PYTHON_API void freeDeadCapturedTracebackFrames();