mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[c10d] add ncclGetLastError to NCCL pg (#83724)
This PR adds the ncclGetLastError API to the NCCL process group, so that NCCL failures are reported with the actual error from NCCL directly instead of a guess at the likely cause. Differential Revision: [D39161199](https://our.internmc.facebook.com/intern/diff/D39161199) Pull Request resolved: https://github.com/pytorch/pytorch/pull/83724 Approved by: https://github.com/kwen2501, https://github.com/H-Huang
This commit is contained in:
committed by
PyTorch MergeBot
parent
ccade9410f
commit
976f8bee94
@ -15,30 +15,49 @@
|
||||
namespace {
|
||||
// Provides additional detail into NCCL error codes based on when these are
|
||||
// thrown in the NCCL codebase.
|
||||
const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
|
||||
std::string getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
|
||||
// Prioritize failure reason provided by PG NCCL first, as it can abort
|
||||
// communicators when it encounters collective timeouts, etc.
|
||||
if (processGroupFailureReason != c10::nullopt) {
|
||||
return (*processGroupFailureReason).c_str();
|
||||
}
|
||||
std::string interpret;
|
||||
std::string err;
|
||||
#ifdef ENABLE_NCCL_GET_LAST_ERROR
|
||||
err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
|
||||
#endif
|
||||
switch (error) {
|
||||
case ncclUnhandledCudaError:
|
||||
return "ncclUnhandledCudaError: Call to CUDA function failed.";
|
||||
case ncclSystemError:
|
||||
return "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
|
||||
"It can be also caused by unexpected exit of a remote peer, you can check NCCL warnings for failure reason and see if there is connection closure by a peer.";
|
||||
case ncclInternalError:
|
||||
return "ncclInternalError: Internal check failed. This is either a bug in NCCL or due to memory corruption";
|
||||
case ncclInvalidArgument:
|
||||
return "ncclInvalidArgument: Invalid value for an argument (such as invalid pointer, device count, ip:host pair, etc).";
|
||||
case ncclInvalidUsage:
|
||||
return "ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).";
|
||||
default:
|
||||
interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
|
||||
break;
|
||||
case ncclSystemError:
|
||||
interpret = "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
|
||||
"It can be also caused by unexpected exit of a remote peer.";
|
||||
break;
|
||||
case ncclInternalError:
|
||||
interpret = "ncclInternalError: Internal check failed.";
|
||||
break;
|
||||
case ncclInvalidArgument:
|
||||
interpret = "ncclInvalidArgument: Invalid value for an argument.";
|
||||
break;
|
||||
case ncclInvalidUsage:
|
||||
interpret = "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
|
||||
break;
|
||||
default:
|
||||
interpret = "Unknown NCCL error!";
|
||||
}
|
||||
return "Unknown NCCL error";
|
||||
return interpret + err;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// ncclGetLastError() is enabled only for NCCL versions 2.13+
|
||||
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
|
||||
(NCCL_MINOR >= 13)
|
||||
#define ENABLE_NCCL_GET_LAST_ERROR
|
||||
#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
|
||||
#define ENABLE_NCCL_GET_LAST_ERROR
|
||||
#endif
|
||||
|
||||
// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
|
||||
// and ncclCommGetAsyncError() are not supported in earlier versions.
|
||||
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
|
||||
|
Reference in New Issue
Block a user