Check all CUDA API calls for errors in torch/ (#81560)

Summary: Original commit changeset: 0bb770d2cdb2 Original Phabricator Diff: D35194935 (79e5b053b6) Differential Revision: D35291874 Pull Request resolved: https://github.com/pytorch/pytorch/pull/81560 Approved by: https://github.com/ezyang
2025-10-20 21:14:14 +08:00 · 2022-10-28 00:40:47 +00:00
parent 4e3a0ff92e
commit 3ece9fb45d
6 changed files with 22 additions and 21 deletions
--- a/torch/csrc/CudaIPCTypes.cpp
+++ b/torch/csrc/CudaIPCTypes.cpp
@ -195,7 +195,7 @@ CudaIPCSentData::~CudaIPCSentData() {
  try {
    if (event_sync_required_) {
      at::cuda::CUDAGuard device_guard(device_.index());
-      cudaEventDestroy(event_);
+      C10_CUDA_CHECK(cudaEventDestroy(event_));
      if (!CudaIPCGlobalEntities::alive) {
        return;
      }
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@ -3,6 +3,7 @@
 #include <torch/csrc/cuda/nccl.h>

 #include <ATen/ATen.h>
+#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
 #include <c10/util/hash.h>
@ -142,7 +143,7 @@ struct NcclCommList {
    if (comms) {
      for (const auto i : c10::irange(ndevices)) {
        int dummy_var;
-        if (cudaGetDevice(&dummy_var) != cudaSuccess) {
+        if (C10_CUDA_ERROR_HANDLED(cudaGetDevice(&dummy_var)) != cudaSuccess) {
          /* there are cases when this destructor is called after the
           CUDA driver is already unloaded from the process.
           In these cases, skip ncclCommDestroy */
--- a/torch/csrc/cuda/shared/cudart.cpp
+++ b/torch/csrc/cuda/shared/cudart.cpp
@ -71,25 +71,26 @@ void initCudartBindings(PyObject* module) {
      "cuda"
      "HostRegister",
      [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t {
-        return cudaHostRegister((void*)ptr, size, flags);
+        return C10_CUDA_ERROR_HANDLED(
+            cudaHostRegister((void*)ptr, size, flags));
      });
  cudart.def(
      "cuda"
      "HostUnregister",
      [](uintptr_t ptr) -> cudaError_t {
-        return cudaHostUnregister((void*)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaHostUnregister((void*)ptr));
      });
  cudart.def(
      "cuda"
      "StreamCreate",
      [](uintptr_t ptr) -> cudaError_t {
-        return cudaStreamCreate((cudaStream_t*)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr));
      });
  cudart.def(
      "cuda"
      "StreamDestroy",
      [](uintptr_t ptr) -> cudaError_t {
-        return cudaStreamDestroy((cudaStream_t)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr));
      });
 #if !defined(USE_ROCM)
  cudart.def(
@ -104,7 +105,7 @@ void initCudartBindings(PyObject* module) {
        c10::cuda::CUDAGuard guard(device);
        size_t device_free = 0;
        size_t device_total = 0;
-        cudaMemGetInfo(&device_free, &device_total);
+        C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
        return {device_free, device_total};
      });
 }
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@ -1175,9 +1175,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
  if (measure_kernel_time_ ||
      isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
      isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
-    cudaEventCreate(&start_event);
-    cudaEventCreate(&finish_event);
-    cudaEventRecord(start_event);
+    C10_CUDA_CHECK(cudaEventCreate(&start_event));
+    C10_CUDA_CHECK(cudaEventCreate(&finish_event));
+    C10_CUDA_CHECK(cudaEventRecord(start_event));
  }

  if (execute_kernel_) {
@ -1233,12 +1233,13 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
  if (measure_kernel_time_ ||
      isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
      isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
-    cudaEventRecord(finish_event);
-    cudaEventSynchronize(start_event);
-    cudaEventSynchronize(finish_event);
-    cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event);
-    cudaEventDestroy(start_event);
-    cudaEventDestroy(finish_event);
+    C10_CUDA_CHECK(cudaEventRecord(finish_event));
+    C10_CUDA_CHECK(cudaEventSynchronize(start_event));
+    C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
+    C10_CUDA_CHECK(
+        cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
+    C10_CUDA_CHECK(cudaEventDestroy(start_event));
+    C10_CUDA_CHECK(cudaEventDestroy(finish_event));

    bytes_processed_ = 0;
    // Figure how many bytes are inputs, outputs, and temporary buffers
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@ -941,7 +941,7 @@ void initializeCudaContext() {
  if (!pctx) {
    std::unique_lock<std::mutex> cudaFreeMutexLock(
        *(c10::cuda::getFreeMutex()));
-    cudaFree(nullptr);
+    C10_CUDA_CHECK(cudaFree(nullptr));
  }
 }

--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@ -81,16 +81,14 @@ struct CUDAMethods : public ProfilerStubs {

  void onEachDevice(std::function<void(int)> op) const override {
    at::cuda::OptionalCUDAGuard device_guard;
-    // NOLINTNEXTLINE(bugprone-signed-char-misuse)
-    int count = at::cuda::device_count();
-    for (const auto i : c10::irange(count)) {
+    for (const auto i : c10::irange(at::cuda::device_count())) {
      device_guard.set_index(i);
      op(i);
    }
  }

  void synchronize() const override {
-    cudaDeviceSynchronize();
+    TORCH_CUDA_CHECK(cudaDeviceSynchronize());
  }

  bool enabled() const override {