init kineto after torch module initialized (#131448)

Fixes #131020

As discussed in the issue thread, we can use `KINETO_DAEMON_INIT_DELAY_S` to delay the initialization of `kineto` in case `kineto` is initialized before `libtorch_cuda.so`.

It's not clear how to set a proper value for the environment variable `KINETO_DAEMON_INIT_DELAY_S`, so here's a trick that makes the initialization of `kineto` happen after the initialization of the `torch` module. I'm not sure whether this is an acceptable trick; please take a look at this PR, thanks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131448
Approved by: https://github.com/sraikund16, https://github.com/briancoutinho
This commit is contained in:
augusto.yjh
2024-10-31 13:24:20 +00:00
committed by PyTorch MergeBot
parent ccaa2a206a
commit c934ed6567
4 changed files with 79 additions and 33 deletions

View File

@ -0,0 +1,51 @@
# Owner(s): ["oncall: profiler"]
import os
import subprocess
import sys
from unittest.mock import patch
import torch
from torch.testing._internal.common_utils import run_tests, TestCase
class SimpleKinetoInitializationTest(TestCase):
    @patch.dict(os.environ, {"KINETO_USE_DAEMON": "1"})
    def test_kineto_profiler_with_environment_variable(self):
        """
        This test checks whether kineto works with torch in daemon mode, please refer to issue #112389 and #131020.
        Besides that, this test will also check that kineto will not be initialized when user loads the shared library
        directly.
        """
        script = """
import torch
if torch.cuda.is_available() > 0:
    torch.cuda.init()
"""
        try:
            subprocess.check_output(
                [sys.executable, "-W", "always", "-c", script],
                cwd=os.path.dirname(os.path.realpath(__file__)),
            )
        except subprocess.CalledProcessError:
            # check_output raises CalledProcessError only for a non-zero exit
            # status, so any exception here already means the child failed;
            # the previous `if e.returncode != 0` guard was dead code, and
            # self.fail() is the idiomatic replacement for assertTrue(False).
            self.fail(
                "Kineto is not working properly with the Dynolog environment variable",
            )
        # import the shared library directly - it triggers static init but doesn't call kineto_init
        env = os.environ.copy()
        env["KINETO_USE_DAEMON"] = "1"
        # A configured init delay would defer kineto init past the assertion
        # below, so strip it from the child's environment.
        env.pop("KINETO_DAEMON_INIT_DELAY_S", None)
        # NOTE(review): the original built `env` but never passed it to the
        # subprocess, so the daemon variable never reached the child; forward
        # it -- confirm run_process_no_exception accepts an `env` kwarg.
        _, stderr = TestCase.run_process_no_exception(
            f"from ctypes import CDLL; CDLL('{torch._C.__file__}')", env=env
        )
        self.assertNotRegex(
            stderr.decode("ascii"),
            "Registering daemon config loader",
            "kineto should not be initialized when the shared library is imported directly",
        )
# Standard PyTorch test entry point: run the tests in this file when it is
# executed directly (e.g. `python test_kineto_init.py`).
if __name__ == "__main__":
    run_tests()

View File

@ -102,6 +102,7 @@
#include <ATen/native/transformers/sdp_utils_cpp.h>
#include <torch/csrc/profiler/combined_traceback.h>
#include <torch/csrc/profiler/kineto_client_interface.h>
#include <sstream>
#ifdef USE_CUDA
@ -2443,6 +2444,10 @@ Call this whenever a new thread is created in order to propagate values from
torch::set_disabled_torch_dispatch_impl(
PyObject_GetAttrString(module, "_disabled_torch_dispatch_impl"));
ASSERT_TRUE(torch::disabled_torch_dispatch_impl() != nullptr);
// init kineto here
#ifdef USE_KINETO
torch::global_kineto_init();
#endif
return module;
END_HANDLE_TH_ERRORS
}

View File

@ -2,6 +2,7 @@
#include <ATen/Context.h>
#include <libkineto.h>
#include <torch/csrc/autograd/profiler_kineto.h>
#include <torch/csrc/profiler/kineto_client_interface.h>
#include <chrono>
#include <thread>
@ -71,46 +72,24 @@ class LibKinetoClient : public libkineto::ClientInterface {
} // namespace profiler::impl
// Initialize kineto once the torch python module has finished loading.
// This is a no-op unless the KINETO_USE_DAEMON environment variable is set
// (or when the build disables the global observer).
void global_kineto_init() {
#if ENABLE_GLOBAL_OBSERVER
  if (!c10::utils::get_env("KINETO_USE_DAEMON").has_value()) {
    return;
  }
  // Fall back to CPU-only profiling when no accelerator backend is present.
  const bool cpu_only = !(at::hasCUDA() || at::hasXPU() || at::hasMTIA());
  libkineto_init(/*cpuOnly=*/cpu_only, /*logOnError=*/true);
  libkineto::api().suppressLogMessages();
#endif
}
#if ENABLE_GLOBAL_OBSERVER
namespace {
// Parse the KINETO_DAEMON_INIT_DELAY_S environment variable.
// Returns the configured delay in whole seconds, or -1 when the variable is
// unset or cannot be parsed as an int.
int get_init_delay() {
  const char* delay_c = std::getenv("KINETO_DAEMON_INIT_DELAY_S");
  if (!delay_c) {
    return -1;
  }
  try {
    return std::stoi(std::string{delay_c});
  } catch (const std::exception&) {
    // std::stoi throws std::invalid_argument for non-numeric input and
    // std::out_of_range for values that do not fit in an int; the original
    // caught only the former, so an oversized value would terminate the
    // process during static initialization. Treat both as "no delay".
    return -1;
  }
}
// Registers the LibKinetoClient with libkineto during static initialization
// of the shared library, and -- when KINETO_USE_DAEMON is set -- runs the
// kineto daemon init, optionally deferred by KINETO_DAEMON_INIT_DELAY_S.
struct RegisterLibKinetoClient {
  RegisterLibKinetoClient() {
    // Function-local static: constructed once and lives for the process
    // lifetime, so the raw pointer handed to libkineto stays valid.
    static profiler::impl::LibKinetoClient client;
    libkineto::api().registerClient(&client);
    auto kineto_init = []() {
      libkineto_init(
          // CPU-only profiling when no accelerator backend is present.
          /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
          /*logOnError=*/true);
      libkineto::api().suppressLogMessages();
    };
    if (std::getenv("KINETO_USE_DAEMON") != nullptr) {
      int init_delay_s = get_init_delay();
      if (init_delay_s > 0) {
        // Deferred init: sleep on a background thread so static init is not
        // blocked. Per the linked issue (#131020), the delay is meant to let
        // dependent shared libraries (e.g. libtorch_cuda.so) finish loading
        // before kineto initializes.
        std::thread t([init_delay_s, kineto_init]() {
          std::this_thread::sleep_for(std::chrono::seconds(init_delay_s));
          kineto_init();
        });
        // NOTE(review): detached thread may still be sleeping at process
        // exit; init is then skipped silently -- presumably acceptable here.
        t.detach();
      } else {
        kineto_init();
      }
    }
  }
} register_libkineto_client;

View File

@ -0,0 +1,11 @@
#pragma once

// Declares the hook that initializes kineto after the torch python module has
// finished loading; defined in kineto_client_interface.cpp and invoked from
// the module-init path in libtorch_python.

// NOTE(review): neither include below appears needed for this lone function
// declaration -- possibly kept for transitive users; confirm before removing.
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/csrc/profiler/unwind/unwind.h>

namespace torch {

// declare global_kineto_init for libtorch_cpu.so to call
TORCH_API void global_kineto_init(void);

} // namespace torch