Revert "Support gpu trace on XPU (#121795)"

This reverts commit 91ead3eae4cd6cbf50fe7a7b4a2f9f35302bc9b2. Reverted https://github.com/pytorch/pytorch/pull/121795 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it breaks ROCm jobs in trunk 74deacbf31, please help take a look and reland the change ([comment](https://github.com/pytorch/pytorch/pull/121794#issuecomment-2013674083))
2025-10-20 21:14:14 +08:00 · 2024-03-21 20:33:16 +00:00
parent 182bb0f2ca
commit 13afbcfc85
8 changed files with 4 additions and 218 deletions
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -2562,7 +2562,6 @@ exclude_patterns = [
    'torch/utils/viz/__init__.py',
    'torch/utils/viz/_cycles.py',
    'torch/utils/weak.py',
-    'torch/xpu/_gpu_trace.py',
 ]
 init_command = [
    'python3',
--- a/aten/src/ATen/xpu/XPUEvent.h
+++ b/aten/src/ATen/xpu/XPUEvent.h
@ -22,15 +22,7 @@ struct TORCH_XPU_API XPUEvent {
  XPUEvent(bool enable_timing = false) noexcept
      : enable_timing_{enable_timing} {}

-  ~XPUEvent() {
-    if (isCreated()) {
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_deletion(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
-    }
-  }
+  ~XPUEvent() = default;

  XPUEvent(const XPUEvent&) = delete;
  XPUEvent& operator=(const XPUEvent&) = delete;
@ -85,13 +77,6 @@ struct TORCH_XPU_API XPUEvent {
  void record(const XPUStream& stream) {
    if (!isCreated()) {
      device_index_ = stream.device_index();
-      event_ = std::make_unique<sycl::event>(
-          stream.queue().ext_oneapi_submit_barrier());
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_creation(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
    } else {
      TORCH_CHECK(
          device_index_ == stream.device_index(),
@ -101,16 +86,9 @@ struct TORCH_XPU_API XPUEvent {
          stream.device_index(),
          ".");
      event_.reset();
-      event_ = std::make_unique<sycl::event>(
-          stream.queue().ext_oneapi_submit_barrier());
-    }
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_event_record(
-          at::kXPU,
-          reinterpret_cast<uintptr_t>(event_.get()),
-          reinterpret_cast<uintptr_t>(&stream.queue()));
    }
+    event_ = std::make_unique<sycl::event>(
+        stream.queue().ext_oneapi_submit_barrier());
  }

  void block(const XPUStream& stream) {
@ -118,13 +96,6 @@ struct TORCH_XPU_API XPUEvent {
      std::vector<sycl::event> event_list{event()};
      // Make this stream wait until event_ is completed.
      stream.queue().ext_oneapi_submit_barrier(event_list);
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_wait(
-            at::kXPU,
-            reinterpret_cast<uintptr_t>(event_.get()),
-            reinterpret_cast<uintptr_t>(&stream.queue()));
-      }
    }
  }

@ -146,11 +117,6 @@ struct TORCH_XPU_API XPUEvent {

  void synchronize() const {
    if (isCreated()) {
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_event_synchronization(
-            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
-      }
      event().wait_and_throw();
    }
  }
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -467,11 +467,6 @@ class XPUAllocator : public Allocator {
    Block* block = device_allocators[device]->malloc(device, size, queue);
    add_allocated_block(block);
    *devPtr = block->ptr;
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_memory_allocation(
-          c10::kXPU, reinterpret_cast<uintptr_t>(*devPtr));
-    }
  }

  void free(void* ptr) {
@ -481,11 +476,6 @@ class XPUAllocator : public Allocator {
    Block* block = get_allocated_block(ptr, /* remove */ true);
    TORCH_CHECK(block, "invalid device pointer: ", ptr);
    device_allocators[block->device]->free(block);
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_memory_deallocation(
-          c10::kXPU, reinterpret_cast<uintptr_t>(block->ptr));
-    }
  }

  void emptyCache() {
--- a/c10/xpu/XPUStream.cpp
+++ b/c10/xpu/XPUStream.cpp
@ -103,17 +103,11 @@ void initDeviceStreamState(DeviceIndex device) {
      {sycl::property::queue::in_order(), queue::priority_high()}};
  for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
    for (const auto i : c10::irange(kStreamsPerPool)) {
-      auto& stream = streams[device][p][i];
-      stream = std::make_unique<sycl::queue>(sycl::queue(
+      streams[device][p][i] = std::make_unique<sycl::queue>(sycl::queue(
          c10::xpu::get_device_context(),
          c10::xpu::get_raw_device(device),
          c10::xpu::asyncHandler,
          properties[p]));
-      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-      if (C10_UNLIKELY(interp)) {
-        (*interp)->trace_gpu_stream_creation(
-            c10::kXPU, reinterpret_cast<uintptr_t>(stream.get()));
-      }
    }
    priority_counters[device][p] = 0;
  }
@ -286,10 +280,6 @@ void syncStreamsOnDevice(DeviceIndex device) {
      streams[device][p][i]->wait();
    }
  }
-  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-  if (C10_UNLIKELY(interp)) {
-    (*interp)->trace_gpu_device_synchronization(c10::kXPU);
-  }
 }

 } // namespace c10::xpu
--- a/c10/xpu/XPUStream.h
+++ b/c10/xpu/XPUStream.h
@ -1,7 +1,6 @@
 #pragma once

 #include <c10/core/Stream.h>
-#include <c10/core/impl/GPUTrace.h>
 #include <c10/xpu/XPUFunctions.h>

 namespace c10::xpu {
@ -88,11 +87,6 @@ class C10_XPU_API XPUStream {

  void synchronize() const {
    queue().wait_and_throw();
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_stream_synchronization(
-          c10::kXPU, reinterpret_cast<uintptr_t>(&queue()));
-    }
  }

  int priority() const;
--- a/c10/xpu/impl/XPUGuardImpl.h
+++ b/c10/xpu/impl/XPUGuardImpl.h
@ -2,7 +2,6 @@

 #include <c10/core/DeviceGuard.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
-#include <c10/core/impl/GPUTrace.h>
 #include <c10/xpu/XPUCachingAllocator.h>
 #include <c10/xpu/XPUFunctions.h>
 #include <c10/xpu/XPUStream.h>
@ -85,13 +84,6 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    auto* xpu_event = reinterpret_cast<sycl::event*>(*event);
    const XPUStream xpu_stream{stream};
    *xpu_event = xpu_stream.queue().ext_oneapi_submit_barrier();
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_event_record(
-          c10::kXPU,
-          reinterpret_cast<uintptr_t>(xpu_event),
-          reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
-    }
  }

  void block(void* event, const Stream& stream) const override {
@ -101,13 +93,6 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    std::vector<sycl::event> event_list{*xpu_event};
    const XPUStream xpu_stream(stream);
    xpu_stream.queue().ext_oneapi_submit_barrier(event_list);
-    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-    if (C10_UNLIKELY(interp)) {
-      (*interp)->trace_gpu_event_wait(
-          c10::kXPU,
-          reinterpret_cast<uintptr_t>(xpu_event),
-          reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
-    }
  }

  bool queryEvent(void* event) const override {
--- a/test/test_xpu.py
+++ b/test/test_xpu.py
@ -4,7 +4,6 @@ import sys
 import unittest

 import torch
-import torch.xpu._gpu_trace as gpu_trace
 from torch.testing._internal.common_utils import NoTest, run_tests, TEST_XPU, TestCase

 if not TEST_XPU:
@ -128,67 +127,5 @@ if __name__ == "__main__":
        self.assertEqual(2024, torch.xpu.initial_seed())


-class TestXpuTrace(TestCase):
-    def setUp(self):
-        torch._C._activate_gpu_trace()
-        self.mock = unittest.mock.MagicMock()
-
-    def test_event_creation_callback(self):
-        gpu_trace.register_callback_for_event_creation(self.mock)
-
-        event = torch.xpu.Event()
-        event.record()
-        self.mock.assert_called_once_with(event._as_parameter_.value)
-
-    def test_event_deletion_callback(self):
-        gpu_trace.register_callback_for_event_deletion(self.mock)
-
-        event = torch.xpu.Event()
-        event.record()
-        event_id = event._as_parameter_.value
-        del event
-        self.mock.assert_called_once_with(event_id)
-
-    def test_event_record_callback(self):
-        gpu_trace.register_callback_for_event_record(self.mock)
-
-        event = torch.xpu.Event()
-        event.record()
-        self.mock.assert_called_once_with(
-            event._as_parameter_.value, torch.xpu.current_stream().sycl_queue
-        )
-
-    def test_event_wait_callback(self):
-        gpu_trace.register_callback_for_event_wait(self.mock)
-
-        event = torch.xpu.Event()
-        event.record()
-        event.wait()
-        self.mock.assert_called_once_with(
-            event._as_parameter_.value, torch.xpu.current_stream().sycl_queue
-        )
-
-    def test_device_synchronization_callback(self):
-        gpu_trace.register_callback_for_device_synchronization(self.mock)
-
-        torch.xpu.synchronize()
-        self.mock.assert_called()
-
-    def test_stream_synchronization_callback(self):
-        gpu_trace.register_callback_for_stream_synchronization(self.mock)
-
-        stream = torch.xpu.Stream()
-        stream.synchronize()
-        self.mock.assert_called_once_with(stream.sycl_queue)
-
-    def test_event_synchronization_callback(self):
-        gpu_trace.register_callback_for_event_synchronization(self.mock)
-
-        event = torch.xpu.Event()
-        event.record()
-        event.synchronize()
-        self.mock.assert_called_once_with(event._as_parameter_.value)
-
-
 if __name__ == "__main__":
    run_tests()
--- a/torch/xpu/_gpu_trace.py
+++ b/torch/xpu/_gpu_trace.py
@ -1,75 +0,0 @@
-from typing import Callable
-
-from torch._utils import CallbackRegistry
-
-
-EventCreationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU event creation"
-)
-EventDeletionCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU event deletion"
-)
-EventRecordCallbacks: "CallbackRegistry[int, int]" = CallbackRegistry(
-    "XPU event record"
-)
-EventWaitCallbacks: "CallbackRegistry[int, int]" = CallbackRegistry(
-    "XPU event wait"
-)
-MemoryAllocationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU memory allocation"
-)
-MemoryDeallocationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU memory deallocation"
-)
-StreamCreationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU stream creation"
-)
-DeviceSynchronizationCallbacks: "CallbackRegistry[[]]" = CallbackRegistry(
-    "XPU device synchronization"
-)
-StreamSynchronizationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU stream synchronization"
-)
-EventSynchronizationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
-    "XPU event synchronization"
-)
-
-
-def register_callback_for_event_creation(cb: Callable[[int], None]) -> None:
-    EventCreationCallbacks.add_callback(cb)
-
-
-def register_callback_for_event_deletion(cb: Callable[[int], None]) -> None:
-    EventDeletionCallbacks.add_callback(cb)
-
-
-def register_callback_for_event_record(cb: Callable[[int, int], None]) -> None:
-    EventRecordCallbacks.add_callback(cb)
-
-
-def register_callback_for_event_wait(cb: Callable[[int, int], None]) -> None:
-    EventWaitCallbacks.add_callback(cb)
-
-
-def register_callback_for_memory_allocation(cb: Callable[[int], None]) -> None:
-    MemoryAllocationCallbacks.add_callback(cb)
-
-
-def register_callback_for_memory_deallocation(cb: Callable[[int], None]) -> None:
-    MemoryDeallocationCallbacks.add_callback(cb)
-
-
-def register_callback_for_stream_creation(cb: Callable[[int], None]) -> None:
-    StreamCreationCallbacks.add_callback(cb)
-
-
-def register_callback_for_device_synchronization(cb: Callable[[], None]) -> None:
-    DeviceSynchronizationCallbacks.add_callback(cb)
-
-
-def register_callback_for_stream_synchronization(cb: Callable[[int], None]) -> None:
-    StreamSynchronizationCallbacks.add_callback(cb)
-
-
-def register_callback_for_event_synchronization(cb: Callable[[int], None]) -> None:
-    EventSynchronizationCallbacks.add_callback(cb)