[PG NCCL] catch cuda lib runtime error - driver shutting down (#74258)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/74258 There is a case where the PG cleanup thread checks CUDA event status after the CUDA runtime library has been unloaded. When that happens, it leads to a "driver shutting down" error. This issue usually happens when a CUDA API is called in the destructor of a global or static object. Test Plan: wait for user Reviewed By: jiayisuse, osalpekar Differential Revision: D34904896 fbshipit-source-id: 705c0812132dae97ea55fcb22730557880ca35e1 (cherry picked from commit ecb5f14a022319402c509b86209f6205212956b7)
2025-10-29 19:24:55 +08:00 · 2022-03-18 12:54:23 -07:00
parent d6edb8473e
commit ec071a0815
1 changed files with 13 additions and 4 deletions
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@ -373,11 +373,20 @@ bool ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const {
 }

 bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const {
-  for (const auto i : c10::irange(devices_.size())) {
-    // Checking the work's corresponding CUDA events' status
-    if (!(*ncclEndEvents_)[i].query()) {
-      return false;
+  try {
+    for (const auto i : c10::irange(devices_.size())) {
+      // Checking the work's corresponding CUDA events' status
+      if (!(*ncclEndEvents_)[i].query()) {
+        return false;
+      }
    }
+  } catch (const std::exception& e) {
+    if (std::string(e.what()).find("driver shutting down") == std::string::npos) {
+      throw;
+    }
+    LOG(INFO) << "[Rank " << rank_
+              << "] Event query failed with exception: "
+              << e.what();
  }
  return true;
 }