[c10d] add ncclGetLastError to NCCL pg (#83724)

This PR add ncclGetLastError API to the nccl pg, to provide better error reporting out of nccl failures directly, instead of guessing on random reasons Differential Revision: [D39161199](https://our.internmc.facebook.com/intern/diff/D39161199) Pull Request resolved: https://github.com/pytorch/pytorch/pull/83724 Approved by: https://github.com/kwen2501, https://github.com/H-Huang
2025-10-20 21:14:14 +08:00 · 2022-09-14 06:02:08 +00:00
parent ccade9410f
commit 976f8bee94
1 changed files with 32 additions and 13 deletions
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@ -15,30 +15,49 @@
 namespace {
 // Provides additional detail into NCCL error codes based on when these are
 // thrown in the NCCL codebase.
-const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
+std::string getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
  // Prioritize failure reason provided by PG NCCL first, as it can abort
  // communicators when it encounters collective timeouts, etc.
  if (processGroupFailureReason != c10::nullopt) {
    return (*processGroupFailureReason).c_str();
  }
+  std::string interpret;
+  std::string err;
+#ifdef ENABLE_NCCL_GET_LAST_ERROR
+  err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
+#endif
  switch (error) {
    case ncclUnhandledCudaError:
-      return "ncclUnhandledCudaError: Call to CUDA function failed.";
-    case ncclSystemError:
-      return "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
-        "It can be also caused by unexpected exit of a remote peer, you can check NCCL warnings for failure reason and see if there is connection closure by a peer.";
-    case ncclInternalError:
-      return "ncclInternalError: Internal check failed. This is either a bug in NCCL or due to memory corruption";
-    case ncclInvalidArgument:
-      return "ncclInvalidArgument: Invalid value for an argument (such as invalid pointer, device count, ip:host pair, etc).";
-    case ncclInvalidUsage:
-      return "ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).";
-    default:
+      interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
      break;
+    case ncclSystemError:
+      interpret = "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
+        "It can be also caused by unexpected exit of a remote peer.";
+      break;
+    case ncclInternalError:
+      interpret = "ncclInternalError: Internal check failed.";
+      break;
+    case ncclInvalidArgument:
+      interpret = "ncclInvalidArgument: Invalid value for an argument.";
+      break;
+    case ncclInvalidUsage:
+      interpret = "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
+      break;
+    default:
+      interpret = "Unknown NCCL error!";
  }
-  return "Unknown NCCL error";
+  return interpret + err;
 }
 } // namespace
+
+// ncclGetLastError() is enabled only for NCCL versions 2.13+
+#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
+    (NCCL_MINOR >= 13)
+#define ENABLE_NCCL_GET_LAST_ERROR
+#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
+#define ENABLE_NCCL_GET_LAST_ERROR
+#endif
+
 // Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
 // and ncclCommGetAsyncError() are not supported in earlier versions.
 #if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \