[Profiler][PrivateUse1] Profiler support PrivateUse1 key (#120556)

Summary:
1. Package Kineto's public headers when USE_KINETO is enabled, so that PrivateUse1 users can consume them.
2. Add a PrivateUse1 key to ActivityType.
3. Support the PrivateUse1 key in deviceTypeFromActivity and _supported_activities.
4. Fix some bugs in the processing of profiler results.
Co-authored-by: albanD <desmaison.alban@gmail.com>
Co-authored-by: Aaron Shi <enye.shi@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120556
Approved by: https://github.com/aaronenyeshi
Author: Florian
Date: 2024-04-12 14:28:19 +00:00
Committed by: PyTorch MergeBot
Parent: 6f4c7eeb08
Commit: 41613a0803
9 changed files with 61 additions and 11 deletions
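Taken together, these changes let an out-of-tree PrivateUse1 backend drive the Kineto profiler end to end. A minimal usage sketch, assuming a backend that has been renamed to the hypothetical name "foo" and has registered a device module:

import torch
from torch.profiler import ProfilerActivity, profile, supported_activities

# With a renamed PrivateUse1 backend loaded, PrivateUse1 is reported here.
print(supported_activities())

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.PrivateUse1]) as prof:
    x = torch.randn(32, 32, device="foo")  # hypothetical device name
    (x @ x).sum()

# Device-side kernels are attributed to the custom backend in the summary.
print(prof.key_averages().table(sort_by="self_cpu_time_total"))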

setup.py

@@ -1386,6 +1386,12 @@ def main():
                 "include/tensorpipe/transport/uv/*.h",
             ]
         )
+    if get_cmake_cache_vars()["USE_KINETO"]:
+        torch_package_data.extend(
+            [
+                "include/kineto/*.h",
+            ]
+        )
     torchgen_package_data = [
         # Recursive glob doesn't work in setup.py,
         # https://github.com/pypa/setuptools/issues/1806
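With the headers shipped under include/kineto, a third-party Kineto plugin build can pick them up from the installed wheel. A hedged sketch for locating them (the exact header names in the packaged wheel are an assumption):

import os

import torch

# Kineto's public headers are packaged next to the other bundled headers.
kineto_include = os.path.join(os.path.dirname(torch.__file__), "include", "kineto")
print(os.path.isdir(kineto_include))  # True on wheels built with USE_KINETO=1
print(sorted(os.listdir(kineto_include)))  # e.g. libkineto.h and friends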

torch/autograd/profiler.py

@@ -270,7 +270,6 @@ class profile:
                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
            else:
                self.kineto_activities.add(ProfilerActivity.PrivateUse1)
                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1
        assert (
            len(self.kineto_activities) > 0
@@ -317,6 +316,10 @@ class profile:
             return
         if self.use_cuda:
             torch.cuda.synchronize()
+        elif self.use_device and hasattr(torch, self.use_device):
+            privateuse1_module = getattr(torch, self.use_device)
+            if hasattr(privateuse1_module, "synchronize"):
+                privateuse1_module.synchronize()
 
         t0 = perf_counter_ns()
         self.kineto_results = _disable_profiler()
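The elif branch above resolves the backend's Python module via getattr(torch, self.use_device) and only calls synchronize() if the module provides one. A hedged sketch of the backend side, again using the hypothetical name "foo":

import torch

class _FooDeviceModule:
    # Minimal device module for a hypothetical PrivateUse1 backend.
    @staticmethod
    def synchronize():
        # Block until all queued work on the custom device has finished, so
        # device events are complete before _disable_profiler() collects them.
        ...

# Makes the module reachable as torch.foo, which the profiler looks up.
torch._register_device_module("foo", _FooDeviceModule)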
@@ -542,7 +545,10 @@ class profile:
                     and fe.id in device_corr_map
                 ):
                     for f_evt in device_corr_map[fe.id]:
-                        if f_evt.device_type == DeviceType.CUDA:
+                        if (
+                            f_evt.device_type == DeviceType.CUDA
+                            or f_evt.device_type == DeviceType.PrivateUse1
+                        ):
                             fe.append_kernel(
                                 f_evt.name,
                                 f_evt.device_index,

torch/autograd/profiler_util.py

@@ -598,7 +598,7 @@ class FunctionEvent(FormattedTimesMixin):
                 [child.privateuse1_time_total for child in self.cpu_children]
             )
         else:
-            assert self.device_type == DeviceType.CUDA
+            assert self.device_type == DeviceType.PrivateUse1
             return self.privateuse1_time_total
 
     @property
@@ -867,7 +867,10 @@ def _build_table(
         event.self_privateuse1_memory_usage > 0 for event in events
     )
     use_device = events[0].use_device
-    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
+    # Running on a PrivateUse1 device with the profiler enabled but without
+    # ProfilerActivity.PrivateUse1 can still capture privateuse1 memory usage,
+    # so when use_device is None we only need to check has_privateuse1_time.
+    if not use_device and has_privateuse1_time:
         raise RuntimeError(
             "use_device is None, but there is private device performance data."
         )
@@ -951,7 +954,7 @@ def _build_table(
                 "Self CUDA Mem",
             ]
         )
-    if has_privateuse1_mem:
+    if use_device and has_privateuse1_mem:
         privateuse1 = use_device.upper()
         headers.extend(
             [
@@ -1132,7 +1135,7 @@ def _build_table(
                     _format_memory(evt.self_cuda_memory_usage),
                 ]
             )
-        if has_privateuse1_mem:
+        if use_device and has_privateuse1_mem:
             row_values.extend(
                 [
                     # PrivateUse1 Mem Total

torch/csrc/autograd/init.cpp

@@ -332,6 +332,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
     if (at::hasMTIA()) {
       activities.insert(torch::profiler::impl::ActivityType::MTIA);
     }
+    if (c10::get_privateuse1_backend() != "privateuseone") {
+      activities.insert(torch::profiler::impl::ActivityType::PrivateUse1);
+    }
 #endif
     return activities;
   });
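c10::get_privateuse1_backend() returns "privateuseone" until a backend registers a custom name, so the check above effectively asks whether a renamed PrivateUse1 backend is present. From Python the rename is done with torch.utils.rename_privateuse1_backend; a brief sketch with the hypothetical name "foo":

import torch

# After renaming, c10::get_privateuse1_backend() returns "foo", so
# _supported_activities() starts reporting ProfilerActivity.PrivateUse1.
torch.utils.rename_privateuse1_backend("foo")
print(torch.profiler.supported_activities())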

torch/csrc/autograd/profiler_kineto.cpp

@@ -555,7 +555,9 @@ void prepareProfiler(
       config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK,
       "Supported only in Kineto profiler");
   torch::profiler::impl::kineto::prepareTrace(
-      /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
+      /*cpuOnly=*/!(
+          at::hasCUDA() || at::hasXPU() || at::hasMTIA() ||
+          c10::get_privateuse1_backend() != "privateuseone"),
       activities,
       config.experimental_config);

torch/csrc/profiler/kineto_shim.cpp

@@ -25,6 +25,8 @@ const std::set<libkineto::ActivityType> kCpuTypes{
     libkineto::ActivityType::CUDA_RUNTIME,
     libkineto::ActivityType::CUDA_DRIVER,
     libkineto::ActivityType::PYTHON_FUNCTION,
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
 };
 
 const std::set<libkineto::ActivityType> kCudaTypes = {
@@ -47,6 +49,15 @@ const std::set<libkineto::ActivityType> kMtiaTypes = {
     libkineto::ActivityType::MTIA_CCP_EVENTS,
     libkineto::ActivityType::MTIA_RUNTIME,
 };
+const std::set<libkineto::ActivityType> kPrivateUse1Types = {
+    libkineto::ActivityType::GPU_MEMCPY,
+    libkineto::ActivityType::GPU_MEMSET,
+    libkineto::ActivityType::GPU_USER_ANNOTATION,
+    libkineto::ActivityType::CONCURRENT_KERNEL,
+    // PRIVATEUSE1_RUNTIME and PRIVATEUSE1_DRIVER appear in both kCpuTypes and kPrivateUse1Types.
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
+};
 } // namespace
 #endif // USE_KINETO
@@ -248,6 +259,9 @@ void prepareTrace(
   if (collectivesProfilerExists()) {
     k_activities.insert(libkineto::ActivityType::COLLECTIVE_COMM);
   }
+  if (activities.count(torch::autograd::profiler::ActivityType::PrivateUse1)) {
+    k_activities.insert(kPrivateUse1Types.begin(), kPrivateUse1Types.end());
+  }
 
   ExperimentalConfigWrapper configWrap(config);
@@ -336,8 +350,18 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     case libkineto::ActivityType::GPU_USER_ANNOTATION:
     case libkineto::ActivityType::CUDA_PROFILER_RANGE:
     // TODO: T151322015
-    case libkineto::ActivityType::MTIA_CCP_EVENTS:
-      return c10::DeviceType::CUDA;
+    case libkineto::ActivityType::MTIA_CCP_EVENTS: {
+      // A PrivateUse1 Kineto backend reuses the ActivityTypes above, so when
+      // a PrivateUse1 backend is registered these activities should map to
+      // c10::DeviceType::PrivateUse1 instead of CUDA.
+      c10::DeviceType device_type = []() {
+        if (c10::get_privateuse1_backend() != "privateuseone") {
+          return c10::DeviceType::PrivateUse1;
+        }
+        return c10::DeviceType::CUDA;
+      }();
+      return device_type;
+    }
     case libkineto::ActivityType::CPU_OP:
     case libkineto::ActivityType::USER_ANNOTATION:
     case libkineto::ActivityType::EXTERNAL_CORRELATION:
@@ -347,6 +371,8 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     case libkineto::ActivityType::MTIA_RUNTIME:
     case libkineto::ActivityType::PYTHON_FUNCTION:
     case libkineto::ActivityType::CUDA_DRIVER:
+    case libkineto::ActivityType::PRIVATEUSE1_RUNTIME:
+    case libkineto::ActivityType::PRIVATEUSE1_DRIVER:
       return c10::DeviceType::CPU;
     default: {
       TORCH_WARN(

torch/csrc/profiler/orchestration/observer.h

@@ -17,6 +17,7 @@ enum class C10_API_ENUM ActivityType {
   XPU, // XPU kernels, runtime
   CUDA, // CUDA kernels, runtime
   MTIA, // MTIA kernels, runtime
+  PrivateUse1, // PrivateUse1 kernels, runtime
   NUM_KINETO_ACTIVITIES, // must be the last one
 };

torch/csrc/profiler/python/init.cpp

@@ -325,7 +325,8 @@ void initPythonBindings(PyObject* module) {
       .value("CPU", ActivityType::CPU)
       .value("XPU", ActivityType::XPU)
       .value("MTIA", ActivityType::MTIA)
-      .value("CUDA", ActivityType::CUDA);
+      .value("CUDA", ActivityType::CUDA)
+      .value("PrivateUse1", ActivityType::PrivateUse1);
 
   py::class_<ExperimentalConfig>(m, "_ExperimentalConfig")
       .def(

torch/profiler/profiler.py

@@ -143,7 +143,9 @@ class _KinetoProfile:
             use_cuda=(ProfilerActivity.CUDA in self.activities),
             use_cpu=(ProfilerActivity.CPU in self.activities),
             use_mtia=(ProfilerActivity.MTIA in self.activities),
-            use_device=None,
+            use_device=self.use_device
+            if (ProfilerActivity.PrivateUse1 in self.activities)
+            else None,
             record_shapes=self.record_shapes,
             with_flops=self.with_flops,
             profile_memory=self.profile_memory,
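Because use_device is forwarded only when ProfilerActivity.PrivateUse1 was requested, table building can distinguish CPU-only runs from runs carrying custom-device data. Continuing the hypothetical "foo" example, the parsed events then expose the privateuse1_* fields used above:

# Inspect per-op custom-device totals on the profile from the earlier sketch.
for evt in prof.key_averages():
    if evt.privateuse1_time_total > 0 or evt.self_privateuse1_memory_usage > 0:
        print(evt.key, evt.privateuse1_time_total, evt.self_privateuse1_memory_usage)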