[profiler] don't disable CUPTI_LAZY_REINIT for cuda >= 12.6 (#150957)

Credit to @mgmtea who wrote the initial version of this PR: https://github.com/pytorch/pytorch/pull/146604 Context: CUPTI is the NVIDIA library that Kineto uses for collecting GPU-side info during profiling. The intended usage is to register a callback while you want profiling to occur, and then unregister the callback when you want profiling to stop. But a bug would cause crashes if CUPTI callbacks were de-registered when used with cudagraphs. The workaround was to disable "CUPTI_LAZY_REINIT" and "CUPTI_TEARDOWN" in Kineto - which prevents crashes, but can result in slower execution after profiling has occurred and completed. This bug is believed to be fixed in CUDA >= 12.6, so this PR qualifies that DISABLE_CUPTI_LAZY_REINIT=1 and CUPTI_TEARDOWN=0 should only be applied if CUDA >= 12.6. Additionally, `profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()` is added as an escape hatch so that we can add a killswitch in case we see more crashes related to this. Differential Revision: [D72745929](https://our.internmc.facebook.com/intern/diff/D72745929) Pull Request resolved: https://github.com/pytorch/pytorch/pull/150957 Approved by: https://github.com/aaronenyeshi, https://github.com/Skylion007
2025-10-20 21:14:14 +08:00 · 2025-04-09 14:29:56 -07:00
parent 6720d23969
commit 37812009fd
4 changed files with 40 additions and 2 deletions
--- a/test/inductor/test_profiler.py
+++ b/test/inductor/test_profiler.py
@ -13,6 +13,7 @@ from torch._inductor import config
 from torch.profiler import ProfilerActivity
 from torch.testing._internal.common_utils import TemporaryFileName
 from torch.testing._internal.inductor_utils import HAS_CUDA
+from torch.torch_version import TorchVersion
 from torch.utils._triton import has_triton


@ -280,6 +281,23 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
        for e in triton_events:
            check_triton_event(e)

+    @unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
+    def test_cupti_lazy_reinit(self):
+        x, y = (torch.randn(4, 4, device="cuda") for _ in range(2))
+
+        def fn(x, y):
+            return (x + y).sin()
+
+        fn_c = torch.compile(fn, mode="reduce-overhead")
+
+        with torch.profiler.profile():
+            fn_c(x, y)
+
+        if TorchVersion(torch.version.cuda) >= "12.6":
+            self.assertEqual("0", os.environ.get("DISABLE_CUPTI_LAZY_REINIT", "0"))
+        else:
+            self.assertEqual("1", os.environ.get("DISABLE_CUPTI_LAZY_REINIT", "0"))
+

 if __name__ == "__main__":
    from torch._inductor.test_case import run_tests
--- a/torch/init.py
+++ b/torch/init.py
@ -54,6 +54,7 @@ from torch._utils import (
 from torch._utils_internal import (
    get_file_path,
    prepare_multiprocessing_environment,
+    profiler_allow_cudagraph_cupti_lazy_reinit_cuda12,
    USE_GLOBAL_DEPS,
    USE_RTLD_GLOBAL_WITH_LIBTORCH,
 )
@ -2294,6 +2295,7 @@ class _TorchCompileInductorWrapper:

    def __init__(self, mode, options, dynamic):
        from torch._inductor.compiler_bisector import CompilerBisector
+        from torch.torch_version import TorchVersion

        self.config: dict[str, _Any] = {}
        self.dynamic = dynamic
@ -2301,7 +2303,13 @@ class _TorchCompileInductorWrapper:
        self.apply_options(options)
        self.apply_options(CompilerBisector.get_config_change("inductor"))

-        if self.config.get("triton.cudagraphs", False):
+        if self.config.get("triton.cudagraphs", False) and (
+            (
+                getattr(torch.version, "cuda", None)
+                and TorchVersion(torch.version.cuda) < "12.6"
+            )
+            or not profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()
+        ):
            os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
            # FIXME: CUDA Graph does not work well with CUPTI teardown.
            #   1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
--- a/torch/_utils_internal.py
+++ b/torch/_utils_internal.py
@ -274,3 +274,7 @@ def record_chromium_event_internal(
    event: dict[str, Any],
 ):
    return None
+
+
+def profiler_allow_cudagraph_cupti_lazy_reinit_cuda12():
+    return True
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@ -23,8 +23,10 @@ from torch._C._profiler import (
    _remove_execution_trace_observer,
 )
 from torch._environment import is_fbcode
+from torch._utils_internal import profiler_allow_cudagraph_cupti_lazy_reinit_cuda12
 from torch.autograd import kineto_available, ProfilerActivity
 from torch.profiler._memory_profiler import MemoryProfile, MemoryProfileTimeline
+from torch.torch_version import TorchVersion


 __all__ = [
@ -223,7 +225,13 @@ class _KinetoProfile:
            if hasattr(torch, "_inductor"):
                import torch._inductor.config as inductor_config

-                if inductor_config.triton.cudagraphs:
+                if inductor_config.triton.cudagraphs and (
+                    (
+                        getattr(torch.version, "cuda", None)
+                        and TorchVersion(torch.version.cuda) < "12.6"
+                    )
+                    or not profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()
+                ):
                    os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
                    self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
                    # FIXME: CUDA Graph does not work well with CUPTI teardown.