[c10d] add ncclGetLastError to NCCL pg (#83724)

This PR adds the ncclGetLastError API to the NCCL process group, to provide
better error reporting by surfacing the error from NCCL failures directly,
instead of guessing at possible reasons.

Differential Revision: [D39161199](https://our.internmc.facebook.com/intern/diff/D39161199)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83724
Approved by: https://github.com/kwen2501, https://github.com/H-Huang
This commit is contained in:
Wanchao Liang
2022-09-14 06:02:08 +00:00
committed by PyTorch MergeBot
parent ccade9410f
commit 976f8bee94

View File

@ -15,30 +15,49 @@
namespace {
// NOTE(review): this region looks like a diff hunk rendered without +/-
// markers: two signatures, repeated `case` labels and two `default:` labels
// appear back to back. The lines that `return` string literals seem to belong
// to the removed `const char*` version; the lines that assign to `interpret`
// seem to belong to the added `std::string` version. Confirm against the
// real file before relying on the exact statement order shown here.
//
// Provides additional detail into NCCL error codes based on when these are
// thrown in the NCCL codebase.
//
// `error` is the NCCL result code to describe. `processGroupFailureReason`,
// when set, is returned as-is and takes priority over any text derived from
// `error`.
const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
std::string getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
// Prioritize failure reason provided by PG NCCL first, as it can abort
// communicators when it encounters collective timeouts, etc.
if (processGroupFailureReason != c10::nullopt) {
// With the `const char*` signature this hands back a pointer into the
// by-value `processGroupFailureReason` parameter, which does not outlive the
// call; with the `std::string` signature the same expression is copied into
// the returned string.
return (*processGroupFailureReason).c_str();
}
// Human-readable explanation of the error code; filled in by the switch.
std::string interpret;
// Text obtained from ncclGetLastError(); stays empty unless
// ENABLE_NCCL_GET_LAST_ERROR is defined.
std::string err;
// NOTE(review): in this view ENABLE_NCCL_GET_LAST_ERROR is defined further
// down than this #ifdef. Confirm the real file defines it before this point;
// otherwise this branch is never compiled.
#ifdef ENABLE_NCCL_GET_LAST_ERROR
// NOTE(review): the std::string is built directly from the returned pointer;
// presumably ncclGetLastError never returns null -- confirm.
err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
#endif
switch (error) {
case ncclUnhandledCudaError:
return "ncclUnhandledCudaError: Call to CUDA function failed.";
case ncclSystemError:
return "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
"It can be also caused by unexpected exit of a remote peer, you can check NCCL warnings for failure reason and see if there is connection closure by a peer.";
case ncclInternalError:
return "ncclInternalError: Internal check failed. This is either a bug in NCCL or due to memory corruption";
case ncclInvalidArgument:
return "ncclInvalidArgument: Invalid value for an argument (such as invalid pointer, device count, ip:host pair, etc).";
case ncclInvalidUsage:
return "ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).";
default:
// NOTE(review): the assignment below presumably pairs with the
// `case ncclUnhandledCudaError:` label further up, not with the `default:`
// label directly above it.
interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
break;
case ncclSystemError:
interpret = "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
"It can be also caused by unexpected exit of a remote peer.";
break;
case ncclInternalError:
interpret = "ncclInternalError: Internal check failed.";
break;
case ncclInvalidArgument:
interpret = "ncclInvalidArgument: Invalid value for an argument.";
break;
case ncclInvalidUsage:
interpret = "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
break;
default:
interpret = "Unknown NCCL error!";
}
return "Unknown NCCL error";
// Explanation of the error code followed by the optional "Last error" text.
return interpret + err;
}
} // namespace
// ncclGetLastError() is enabled only for NCCL versions 2.13+: that is, major
// version 2 with minor version >= 13, or any major version >= 3.
#if defined(NCCL_MAJOR) && \
    (((NCCL_MAJOR == 2) && defined(NCCL_MINOR) && (NCCL_MINOR >= 13)) || \
     (NCCL_MAJOR >= 3))
#define ENABLE_NCCL_GET_LAST_ERROR
#endif
// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
// and ncclCommGetAsyncError() are not supported in earlier versions.
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \