Compare commits


34 Commits

SHA1        Message                                    Date
11475bc680  Update [ghstack-poisoned]                  2025-08-09 02:13:33 +00:00
f062bd07f6  Update (base update) [ghstack-poisoned]    2025-08-09 02:13:33 +00:00
276bd0976f  Update [ghstack-poisoned]                  2025-07-29 15:42:33 +00:00
ef60b379bc  Update (base update) [ghstack-poisoned]    2025-07-29 15:42:33 +00:00
c27655a804  Update [ghstack-poisoned]                  2025-07-18 14:37:10 +00:00
33e100bc65  Update (base update) [ghstack-poisoned]    2025-07-18 14:37:10 +00:00
f58c6e0e18  Update [ghstack-poisoned]                  2025-07-18 14:16:26 +00:00
e87a329847  Update (base update) [ghstack-poisoned]    2025-07-18 14:16:26 +00:00
df8d8745fa  Update [ghstack-poisoned]                  2025-07-17 17:27:16 +00:00
05af53b382  Update (base update) [ghstack-poisoned]    2025-07-17 17:27:16 +00:00
ca39521f19  Update [ghstack-poisoned]                  2025-07-17 11:55:09 +00:00
2a2ca81995  Update (base update) [ghstack-poisoned]    2025-07-17 11:55:09 +00:00
dce84b6e09  Update [ghstack-poisoned]                  2025-07-16 17:20:31 +00:00
ea84136101  Update (base update) [ghstack-poisoned]    2025-07-15 21:33:08 +00:00
f463a526da  Update [ghstack-poisoned]                  2025-07-15 21:33:08 +00:00
41a448dc9a  Update (base update) [ghstack-poisoned]    2025-07-15 20:47:11 +00:00
5b5d8ccfbc  Update [ghstack-poisoned]                  2025-07-15 20:47:11 +00:00
2ea7c398a2  Update [ghstack-poisoned]                  2025-07-15 20:44:32 +00:00
78dd8a0c1d  Update (base update) [ghstack-poisoned]    2025-07-15 20:08:52 +00:00
e9df1a9ccc  Update [ghstack-poisoned]                  2025-07-15 20:08:52 +00:00
24a4ef6bb0  Update (base update) [ghstack-poisoned]    2025-07-15 19:51:53 +00:00
626bfd34ef  Update [ghstack-poisoned]                  2025-07-15 19:51:53 +00:00
90b0b0c89a  Update (base update) [ghstack-poisoned]    2025-07-15 17:57:34 +00:00
a4623e837c  Update [ghstack-poisoned]                  2025-07-15 17:57:34 +00:00
560ff47a2f  Update (base update) [ghstack-poisoned]    2025-07-15 16:46:52 +00:00
9d343a6174  Update [ghstack-poisoned]                  2025-07-15 16:46:52 +00:00
cc235e71e6  Update (base update) [ghstack-poisoned]    2025-07-15 14:39:20 +00:00
ddd16b1e20  Update [ghstack-poisoned]                  2025-07-15 14:39:20 +00:00
a81f92dc10  Update (base update) [ghstack-poisoned]    2025-07-15 13:22:34 +00:00
6a68668218  Update [ghstack-poisoned]                  2025-07-15 13:22:34 +00:00
929c2b385d  Update (base update) [ghstack-poisoned]    2025-07-15 12:34:03 +00:00
b1274bd5e3  Update [ghstack-poisoned]                  2025-07-15 12:34:03 +00:00
cb9dddcef3  Update (base update) [ghstack-poisoned]    2025-07-14 15:01:01 +00:00
b2aeee430b  Update [ghstack-poisoned]                  2025-07-14 15:01:01 +00:00
6 changed files with 479 additions and 302 deletions

aten/src/ATen/cuda/CUDAEvent.h

@@ -2,248 +2,6 @@
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/util/Exception.h>
#include <cuda_runtime_api.h>
#include <cstdint>
#include <utility>
/*
* `cudaEventExternal` is a torch-specific flag that is used to
* indicate that the CUDAEvent will be used only for synchronization
* with work outside of the cuda graph, rather than creation of
* cross-stream dependencies within a cuda graph. Resources:
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#cross-stream-dependencies-and-events
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3457b81d1d32c6a00f6132fbc2693d47
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g0c23426b7252eaa9cef695859991304e
*/
#define cudaEventExternal 0x08
namespace at::cuda {
/*
* CUDAEvents are movable, not copyable, wrappers around CUDA events.
*
* A CUDAEvent is constructed lazily when first recorded, unless it is
* reconstructed from a cudaIpcEventHandle_t. The event acquires its device
* from the first recording stream. If reconstructed from a handle, however,
* the device must be specified explicitly; and if ipc_handle() is called
* before the event is ever recorded, the current device is used.
* Later streams that record the event must match this device.
*/
struct TORCH_CUDA_CPP_API CUDAEvent {
// Constructors
// Default value for `flags` is specified below - it's cudaEventDisableTiming
CUDAEvent() noexcept = default;
CUDAEvent(unsigned int flags) noexcept : flags_{flags} {}
CUDAEvent(
DeviceIndex device_index, const cudaIpcEventHandle_t* handle) : device_index_(device_index) {
CUDAGuard guard(device_index_);
AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle));
is_created_ = true;
}
// Note: event destruction done on creating device to avoid creating a
// CUDA context on other devices.
~CUDAEvent() {
try {
if (is_created_) {
CUDAGuard guard(device_index_);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
AT_CUDA_CHECK(cudaEventDestroy(event_));
}
} catch (...) { /* No throw */ }
}
CUDAEvent(const CUDAEvent&) = delete;
CUDAEvent& operator=(const CUDAEvent&) = delete;
CUDAEvent(CUDAEvent&& other) noexcept { moveHelper(std::move(other)); }
CUDAEvent& operator=(CUDAEvent&& other) noexcept {
if (this != &other) {
moveHelper(std::move(other));
}
return *this;
}
operator cudaEvent_t() const { return event(); }
// Less than operator (to allow use in sets)
friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
return left.event_ < right.event_;
}
std::optional<at::Device> device() const {
if (is_created_) {
return at::Device(at::kCUDA, device_index_);
} else {
return {};
}
}
bool isCreated() const { return is_created_; }
DeviceIndex device_index() const {return device_index_;}
cudaEvent_t event() const { return event_; }
// Note: cudaEventQuery can be safely called from any device
bool query() const {
if (!is_created_) {
return true;
}
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
void record() { record(getCurrentCUDAStream()); }
void recordOnce(const CUDAStream& stream) {
if (!was_recorded_) record(stream);
}
// Note: cudaEventRecord must be called on the same device as the event.
void record(const CUDAStream& stream) {
if (!is_created_) {
createEvent(stream.device_index());
}
TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_,
" does not match recording stream's device ", stream.device_index(), ".");
CUDAGuard guard(device_index_);
#ifndef USE_ROCM
// it is an error to use cudaEventRecordExternal when not doing stream capture
unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? cudaEventRecordExternal : cudaEventRecordDefault;
AT_CUDA_CHECK(cudaEventRecordWithFlags(event_, stream, flags));
#else
AT_CUDA_CHECK(cudaEventRecord(event_, stream));
#endif
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(at::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream())
);
}
was_recorded_ = true;
}
// Note: cudaStreamWaitEvent must be called on the same device as the stream.
// The event has no actual GPU resources associated with it.
void block(const CUDAStream& stream) {
if (is_created_) {
CUDAGuard guard(stream.device_index());
#ifndef USE_ROCM
// it is an error to use cudaEventWaitExternal when not doing stream capture
unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? cudaEventWaitExternal : cudaEventWaitDefault;
AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, flags));
#else
AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_));
#endif
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(at::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream())
);
}
}
}
// Note: cudaEventElapsedTime can be safely called from any device
float elapsed_time(const CUDAEvent& other) const {
TORCH_CHECK_VALUE(
!(flags_ & cudaEventDisableTiming) && !(other.flags_ & cudaEventDisableTiming),
"Both events must be created with argument 'enable_timing=True'.");
TORCH_CHECK_VALUE(
is_created_ && other.isCreated(),
"Both events must be recorded before calculating elapsed time.");
TORCH_CHECK(
query() && other.query(),
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
// We do not strictly have to set the device index to the same as our event,
// but if we don't and the current device is not initialized, it will
// create a new cuda context, which will consume a lot of memory.
CUDAGuard guard(device_index_);
// raise cudaErrorNotReady if either event is recorded but not yet completed
AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
return time_ms;
}
// Note: cudaEventSynchronize can be safely called from any device
void synchronize() const {
if (is_created_) {
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
AT_CUDA_CHECK(cudaEventSynchronize(event_));
}
}
// Note: cudaIpcGetEventHandle must be called on the same device as the event
void ipc_handle(cudaIpcEventHandle_t * handle) {
if (!is_created_) {
// this CUDAEvent object was initially constructed from flags but event_
// is not created yet.
createEvent(getCurrentCUDAStream().device_index());
}
CUDAGuard guard(device_index_);
AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_));
}
private:
unsigned int flags_ = cudaEventDisableTiming;
bool is_created_ = false;
bool was_recorded_ = false;
bool external_ = false;
DeviceIndex device_index_ = -1;
cudaEvent_t event_{};
void createEvent(DeviceIndex device_index) {
external_ = (flags_ & cudaEventExternal) != 0;
#ifdef USE_ROCM
TORCH_CHECK(!external_, "External events are disallowed in rocm");
#endif
flags_ &= ~cudaEventExternal;
device_index_ = device_index;
CUDAGuard guard(device_index_);
AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_creation(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
is_created_ = true;
}
void moveHelper(CUDAEvent&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(was_recorded_, other.was_recorded_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
};
} // namespace at::cuda
+#include <c10/cuda/CUDAEvent.h>
+#include <c10/cuda/CUDAGuard.h>
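
For orientation, the event API above is typically driven like this. A minimal usage sketch, assuming a CUDA build with an initialized device; `time_gpu_work` is illustrative, not part of this diff:

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>

// Sketch: time a stretch of GPU work with a pair of events.
float time_gpu_work() {
  // Pass cudaEventDefault (i.e. omit cudaEventDisableTiming) so that
  // elapsed_time() is permitted; the class default disables timing.
  at::cuda::CUDAEvent start(cudaEventDefault);
  at::cuda::CUDAEvent stop(cudaEventDefault);
  auto stream = at::cuda::getCurrentCUDAStream();
  start.record(stream);  // first record() lazily creates the event on stream's device
  // ... enqueue kernels on `stream` here ...
  stop.record(stream);
  stop.synchronize();    // block the host until `stop` has completed
  return start.elapsed_time(stop);  // both events are complete at this point
}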

aten/src/ATen/cuda/CachingHostAllocator.cpp

@@ -14,63 +14,10 @@
namespace at::cuda {
namespace {
// Note: cudaEventCreate when concurrently invoked from multiple threads can be
// very expensive (at least on certain device/driver combinations). Thus, we a)
// serialize event creation at a per-device level, and b) pool the events to
// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
// significant improvements in multithreaded workloads with high allocation
// rates.
class EventPool {
public:
using Event = std::unique_ptr<
at::cuda::CUDAEvent,
std::function<void(at::cuda::CUDAEvent*)>>;
EventPool() : pools_(at::cuda::device_count()) {}
Event get(DeviceIndex device) {
TORCH_INTERNAL_ASSERT(0 <= device);
TORCH_INTERNAL_ASSERT(device < static_cast<DeviceIndex>(pools_.size()));
auto& pool = pools_[device];
auto destructor = [&pool](at::cuda::CUDAEvent* event) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.push_back(std::unique_ptr<at::cuda::CUDAEvent>(event));
};
// Try to acquire an event from the per-device pool.
{
std::lock_guard<std::mutex> g(pool.mutex_);
if (!pool.event_pool_.empty()) {
auto* event = pool.event_pool_.back().release();
pool.event_pool_.pop_back();
return Event(event, destructor);
}
}
// otherwise, allocate a new event that will be returned to the pool on
// destruction.
return Event(
std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming).release(),
destructor);
}
void empty_cache() {
for (auto& pool : pools_) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.clear();
}
}
private:
struct PerDevicePool {
alignas(64) std::mutex mutex_;
std::vector<std::unique_ptr<at::cuda::CUDAEvent>> event_pool_;
};
std::vector<PerDevicePool> pools_;
};
using Block = HostBlock<CUDAStream>;
struct CUDACachingHostAllocatorImpl
-    : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
+    : public CachingHostAllocatorImpl<CUDAStream, CUDAEventPool::Event> {
private:
std::unordered_map<void*, bool> use_host_register;
@@ -143,14 +90,14 @@ struct CUDACachingHostAllocatorImpl
}
void record_stream(
-      std::optional<std::vector<EventPool::Event>>& events,
+      std::optional<std::vector<CUDAEventPool::Event>>& events,
CUDAStream stream) override {
auto event = create_event_internal(stream.device_index());
event->record(stream);
events->push_back(std::move(event));
}
-  bool query_event(EventPool::Event& event) override {
+  bool query_event(CUDAEventPool::Event& event) override {
cudaError_t err = cudaEventQuery(*event);
if (err == cudaErrorNotReady) {
(void)cudaGetLastError(); // clear CUDA error
@@ -166,9 +113,9 @@ struct CUDACachingHostAllocatorImpl
pinned_use_background_threads();
}
-  EventPool::Event create_event_internal(DeviceIndex idx) {
+  CUDAEventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
-    static auto* event_pool = new EventPool();
+    static auto* event_pool = new CUDAEventPool();
return event_pool->get(idx);
}
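
The `Event` alias used here is a std::unique_ptr whose custom deleter recycles the event rather than destroying it. A hedged sketch of the round trip through the pool (CUDAEventPool is the class added in c10/cuda/CUDAEvent.h below; `record_pooled_event` and its arguments are illustrative only):

#include <c10/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>

// Sketch: acquire a pooled event, record it, and let RAII return it.
void record_pooled_event(c10::cuda::CUDAEventPool& pool,
                         const c10::cuda::CUDAStream& stream) {
  // get() pops a free event from the per-device list, or creates one.
  c10::cuda::CUDAEventPool::Event event = pool.get(stream.device_index());
  event->record(stream);
  // ... later, query() tells the allocator the pinned block is reusable ...
}  // `event` leaves scope: the deleter pushes it back into the pool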

aten/src/ATen/hip/impl/HIPEventMasqueradingAsCUDA.h (new file)

@@ -0,0 +1,134 @@
#pragma once
#include <c10/hip/HIPEvent.h>
// Use of c10::hip namespace here makes hipification easier, because
// I don't have to also fix namespaces. Sorry!
namespace c10 { namespace hip {
// See Note [Masquerading as CUDA] for motivation
struct HIPEventMasqueradingAsCUDA {
HIPEventMasqueradingAsCUDA() noexcept = default;
HIPEventMasqueradingAsCUDA(unsigned int flags) noexcept
: event_(HIPEvent(flags)) {}
HIPEventMasqueradingAsCUDA(
DeviceIndex device_index,
const hipIpcEventHandle_t* handle)
: event_(HIPEvent(device_index, handle)) {}
~HIPEventMasqueradingAsCUDA() = default;
HIPEventMasqueradingAsCUDA(const HIPEventMasqueradingAsCUDA&) = delete;
HIPEventMasqueradingAsCUDA& operator=(const HIPEventMasqueradingAsCUDA&) = delete;
HIPEventMasqueradingAsCUDA(HIPEventMasqueradingAsCUDA&& other) noexcept = default;
HIPEventMasqueradingAsCUDA& operator=(HIPEventMasqueradingAsCUDA&& other) noexcept = default;
operator hipEvent_t() const {
return event_.event();
}
// Less than operator (to allow use in sets)
friend bool operator<(
const HIPEventMasqueradingAsCUDA& left,
const HIPEventMasqueradingAsCUDA& right) {
return left.event_ < right.event_;
}
std::optional<c10::Device> device() const {
// Unsafely coerce HIP device into CUDA device
return Device(c10::DeviceType::CUDA, event_.device_index());
}
bool isCreated() const {
return event_.isCreated();
}
DeviceIndex device_index() const {
return event_.device_index();
}
hipEvent_t event() const {
return event_.event();
}
bool query() const {
return event_.query();
}
void record() {
return event_.record();
}
void recordOnce(const HIPStreamMasqueradingAsCUDA& stream) {
event_.recordOnce(stream.hip_stream());
}
void record(const HIPStreamMasqueradingAsCUDA& stream) {
event_.record(stream.hip_stream());
}
void block(const HIPStreamMasqueradingAsCUDA& stream) {
event_.block(stream.hip_stream());
}
float elapsed_time(const HIPEventMasqueradingAsCUDA& other) const {
return event_.elapsed_time(other.event_);
}
void synchronize() const {
event_.synchronize();
}
void ipc_handle(hipIpcEventHandle_t* handle) {
event_.ipc_handle(handle);
}
private:
HIPEvent event_;
};
class HIPEventPoolMasqueradingAsCUDA {
public:
using Event = std::unique_ptr<
HIPEventMasqueradingAsCUDA,
std::function<void(HIPEventMasqueradingAsCUDA*)>>;
HIPEventPoolMasqueradingAsCUDA() = default;
Event get(DeviceIndex device) {
TORCH_INTERNAL_ASSERT(0 <= device);
TORCH_INTERNAL_ASSERT(device < static_cast<DeviceIndex>(pools_.size()));
auto& pool = pools_[device];
auto destructor = [&pool](HIPEventMasqueradingAsCUDA* event) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.push_back(std::unique_ptr<HIPEventMasqueradingAsCUDA>(event));
};
// Try to acquire an event from the per-device pool.
{
std::lock_guard<std::mutex> g(pool.mutex_);
if (!pool.event_pool_.empty()) {
auto* event = pool.event_pool_.back().release();
pool.event_pool_.pop_back();
return Event(event, destructor);
}
}
// otherwise, allocate a new event that will be returned to the pool on
// destruction.
return Event(
std::make_unique<HIPEventMasqueradingAsCUDA>().release(), destructor);
}
void empty_cache() {
for (auto& pool : pools_) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.clear();
}
}
private:
struct PerDevicePool {
alignas(64) std::mutex mutex_;
std::vector<std::unique_ptr<HIPEventMasqueradingAsCUDA>> event_pool_;
};
std::vector<PerDevicePool> pools_ =
std::vector<PerDevicePool>(c10::hip::device_count());
};
}} // namespace c10::hip
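
After hipify, CUDA-facing call sites compile against this wrapper without edits. A hypothetical cross-stream synchronization, assuming the existing masquerading stream header in ATen:

#include <ATen/hip/impl/HIPEventMasqueradingAsCUDA.h>
#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>

// Sketch: make `consumer` wait for work already enqueued on `producer`.
void sync_streams(const c10::hip::HIPStreamMasqueradingAsCUDA& producer,
                  const c10::hip::HIPStreamMasqueradingAsCUDA& consumer) {
  c10::hip::HIPEventMasqueradingAsCUDA event;
  event.record(producer);  // records the underlying hipEvent_t
  event.block(consumer);   // hipStreamWaitEvent on the wrapped stream
  // By design, event.device() reports DeviceType::CUDA here.
}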

c10/cuda/CMakeLists.txt

@@ -43,6 +43,7 @@ set(C10_CUDA_HEADERS
CUDACachingAllocator.h
CUDADeviceAssertionHost.h
CUDAException.h
+    CUDAEvent.h
CUDAFunctions.h
CUDAGuard.h
CUDAMacros.h

c10/cuda/CUDAEvent.h (new file, 324 lines)

@@ -0,0 +1,324 @@
#pragma once
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Exception.h>
/*
* `cudaEventExternal` is a torch-specific flag that is used to
* indicate that the CUDAEvent will be used only for synchronization
* with work outside of the cuda graph, rather than creation of
* cross-stream dependencies within a cuda graph. Resources:
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#cross-stream-dependencies-and-events
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3457b81d1d32c6a00f6132fbc2693d47
* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g0c23426b7252eaa9cef695859991304e
*/
#define cudaEventExternal 0x08
namespace c10::cuda {
/*
* CUDAEvents are movable, not copyable, wrappers around CUDA events.
*
* A CUDAEvent is constructed lazily when first recorded, unless it is
* reconstructed from a cudaIpcEventHandle_t. The event acquires its device
* from the first recording stream. If reconstructed from a handle, however,
* the device must be specified explicitly; and if ipc_handle() is called
* before the event is ever recorded, the current device is used.
* Later streams that record the event must match this device.
*/
struct CUDAEvent {
// Constructors
// Default value for `flags` is specified below - it's cudaEventDisableTiming
CUDAEvent() noexcept = default;
CUDAEvent(unsigned int flags) noexcept : flags_{flags} {}
CUDAEvent(DeviceIndex device_index, const cudaIpcEventHandle_t* handle)
: device_index_(device_index) {
CUDAGuard guard(device_index_);
C10_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle));
is_created_ = true;
}
// Note: event destruction done on creating device to avoid creating a
// CUDA context on other devices.
~CUDAEvent() {
if (is_created_) {
CUDAGuard guard(device_index_);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(
c10::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
C10_CUDA_CHECK_WARN(cudaEventDestroy(event_));
}
}
CUDAEvent(const CUDAEvent&) = delete;
CUDAEvent& operator=(const CUDAEvent&) = delete;
CUDAEvent(CUDAEvent&& other) noexcept {
moveHelper(other);
}
CUDAEvent& operator=(CUDAEvent&& other) noexcept {
if (this != &other) {
moveHelper(other);
}
return *this;
}
operator cudaEvent_t() const {
return event();
}
// Less than operator (to allow use in sets)
friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
return left.event_ < right.event_;
}
std::optional<c10::Device> device() const {
if (is_created_) {
return c10::Device(c10::kCUDA, device_index_);
} else {
return {};
}
}
bool isCreated() const {
return is_created_;
}
DeviceIndex device_index() const {
return device_index_;
}
cudaEvent_t event() const {
return event_;
}
// Note: cudaEventQuery can be safely called from any device
bool query() const {
if (!is_created_) {
return true;
}
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
void record() {
record(getCurrentCUDAStream());
}
void recordOnce(const CUDAStream& stream) {
if (!was_recorded_)
record(stream);
}
// Note: cudaEventRecord must be called on the same device as the event.
void record(const CUDAStream& stream) {
if (!is_created_) {
createEvent(stream.device_index());
}
TORCH_CHECK(
device_index_ == stream.device_index(),
"Event device ",
device_index_,
" does not match recording stream's device ",
stream.device_index(),
".");
CUDAGuard guard(device_index_);
#ifndef USE_ROCM
// it is an error to use cudaEventRecordExternal when not doing stream
// capture
unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() !=
c10::cuda::CaptureStatus::None &&
external_)
? cudaEventRecordExternal
: cudaEventRecordDefault;
C10_CUDA_CHECK(cudaEventRecordWithFlags(event_, stream, flags));
#else
C10_CUDA_CHECK(cudaEventRecord(event_, stream));
#endif
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
c10::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream()));
}
was_recorded_ = true;
}
// Note: cudaStreamWaitEvent must be called on the same device as the stream.
// The event has no actual GPU resources associated with it.
void block(const CUDAStream& stream) {
if (is_created_) {
CUDAGuard guard(stream.device_index());
#ifndef USE_ROCM
// it is an error to use cudaEventWaitExternal when not doing stream
// capture
unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() !=
c10::cuda::CaptureStatus::None &&
external_)
? cudaEventWaitExternal
: cudaEventWaitDefault;
C10_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, flags));
#else
C10_CUDA_CHECK(cudaStreamWaitEvent(stream, event_));
#endif
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
c10::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream()));
}
}
}
// Note: cudaEventElapsedTime can be safely called from any device
float elapsed_time(const CUDAEvent& other) const {
TORCH_CHECK_VALUE(
!(flags_ & cudaEventDisableTiming) &&
!(other.flags_ & cudaEventDisableTiming),
"Both events must be created with argument 'enable_timing=True'.");
TORCH_CHECK_VALUE(
is_created_ && other.isCreated(),
"Both events must be recorded before calculating elapsed time.");
TORCH_CHECK(
query() && other.query(),
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
// We do not strictly have to set the device index to the same as our event,
// but if we don't and the current device is not initialized, it will
// create a new cuda context, which will consume a lot of memory.
CUDAGuard guard(device_index_);
// raise cudaErrorNotReady if either event is recorded but not yet completed
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
return time_ms;
}
// Note: cudaEventSynchronize can be safely called from any device
void synchronize() const {
if (is_created_) {
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
C10_CUDA_CHECK(cudaEventSynchronize(event_));
}
}
// Note: cudaIpcGetEventHandle must be called on the same device as the event
void ipc_handle(cudaIpcEventHandle_t* handle) {
if (!is_created_) {
// this CUDAEvent object was initially constructed from flags but event_
// is not created yet.
createEvent(getCurrentCUDAStream().device_index());
}
CUDAGuard guard(device_index_);
C10_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_));
}
private:
unsigned int flags_ = cudaEventDisableTiming;
bool is_created_ = false;
bool was_recorded_ = false;
bool external_ = false;
DeviceIndex device_index_ = -1;
cudaEvent_t event_{};
void createEvent(DeviceIndex device_index) {
external_ = (flags_ & cudaEventExternal) != 0;
#ifdef USE_ROCM
TORCH_CHECK(!external_, "External events are disallowed in rocm");
#endif
flags_ &= ~cudaEventExternal;
device_index_ = device_index;
CUDAGuard guard(device_index_);
C10_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_creation(
c10::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
is_created_ = true;
}
void moveHelper(CUDAEvent& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(was_recorded_, other.was_recorded_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
};
// Note: cudaEventCreate when concurrently invoked from multiple threads can be
// very expensive (at least on certain device/driver combinations). Thus, we a)
// serialize event creation at a per-device level, and b) pool the events to
// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
// significant improvements in multithreaded workloads with high allocation
// rates.
class CUDAEventPool {
public:
using Event = std::unique_ptr<
c10::cuda::CUDAEvent,
std::function<void(c10::cuda::CUDAEvent*)>>;
CUDAEventPool() = default;
Event get(DeviceIndex device) {
TORCH_INTERNAL_ASSERT(0 <= device);
TORCH_INTERNAL_ASSERT(device < static_cast<DeviceIndex>(pools_.size()));
auto& pool = pools_[device];
auto destructor = [&pool](c10::cuda::CUDAEvent* event) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.push_back(std::unique_ptr<c10::cuda::CUDAEvent>(event));
};
// Try to acquire an event from the per-device pool.
{
std::lock_guard<std::mutex> g(pool.mutex_);
if (!pool.event_pool_.empty()) {
auto* event = pool.event_pool_.back().release();
pool.event_pool_.pop_back();
return Event(event, destructor);
}
}
// otherwise, allocate a new event that will be returned to the pool on
// destruction.
return Event(
std::make_unique<c10::cuda::CUDAEvent>().release(), destructor);
}
void empty_cache() {
for (auto& pool : pools_) {
std::lock_guard<std::mutex> g(pool.mutex_);
pool.event_pool_.clear();
}
}
private:
struct PerDevicePool {
alignas(64) std::mutex mutex_;
std::vector<std::unique_ptr<c10::cuda::CUDAEvent>> event_pool_;
};
std::vector<PerDevicePool> pools_ =
std::vector<PerDevicePool>(c10::cuda::device_count());
};
} // namespace c10::cuda
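
The IPC constructor above pairs with ipc_handle() across processes. A hedged sketch of that round trip, with the inter-process plumbing elided; note that cudaIpcGetEventHandle requires the interprocess and timing-disabled creation flags:

#include <c10/cuda/CUDAEvent.h>

// Sketch, producer process: export a handle for another process.
void export_event(cudaIpcEventHandle_t* out) {
  // Keep the event alive for as long as the handle may be used.
  static c10::cuda::CUDAEvent event(
      cudaEventInterprocess | cudaEventDisableTiming);
  event.ipc_handle(out);  // creates the event lazily on the current device
}

// Sketch, consumer process: rebuild the event from the received handle.
// The device must be given explicitly, per the class comment above.
c10::cuda::CUDAEvent import_event(c10::DeviceIndex device,
                                  const cudaIpcEventHandle_t* handle) {
  return {device, handle};
}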

torch/utils/hipify/cuda_to_hip_mappings.py

@@ -8910,6 +8910,10 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict(
API_PYTORCH,
),
),
("cuda::CUDAEvent", ("hip::HIPEventMasqueradingAsCUDA", API_PYTORCH)),
("CUDAEvent", ("HIPEventMasqueradingAsCUDA", API_PYTORCH)),
("cuda::CUDAEventPool", ("hip::HIPEventPoolMasqueradingAsCUDA", API_PYTORCH)),
("CUDAEventPool", ("HIPEventPoolMasqueradingAsCUDA", API_PYTORCH)),
("cuda::CUDAStream", ("hip::HIPStreamMasqueradingAsCUDA", API_PYTORCH)),
("CUDAStream", ("HIPStreamMasqueradingAsCUDA", API_PYTORCH)),
(
@@ -8964,6 +8968,14 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict(
"c10/cuda/CUDACachingAllocator.h",
("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH),
),
+    (
+        "ATen/cuda/CUDAEvent.h",  # To keep BC, we have to keep this mapping
+        ("ATen/hip/HIPEvent.h", API_PYTORCH),
+    ),
+    (
+        "c10/cuda/CUDAEvent.h",
+        ("ATen/hip/impl/HIPEventMasqueradingAsCUDA.h", API_PYTORCH),
+    ),
(
"c10/cuda/CUDAStream.h",
("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH),
@@ -9102,6 +9114,7 @@ C10_MAPPINGS = collections.OrderedDict(
("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)),
("c10/cuda/CUDAFunctions.h", ("c10/hip/HIPFunctions.h", API_C10)),
("c10/cuda/CUDAMiscFunctions.h", ("c10/hip/HIPMiscFunctions.h", API_C10)),
("c10/cuda/CUDAEvent.h", ("c10/hip/HIPEvent.h", API_C10)),
("c10/cuda/CUDAStream.h", ("c10/hip/HIPStream.h", API_C10)),
("c10/cuda/CUDAGraphsC10Utils.h", ("c10/hip/HIPGraphsC10Utils.h", API_C10)),
("c10/cuda/CUDAAllocatorConfig.h", ("c10/hip/HIPAllocatorConfig.h", API_C10)),