mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Adopt ncclRemoteError (#85887)
`ncclRemoteError` was added in NCCL 2.13 to indicate a network error or a remote process exiting prematurely. Pull Request resolved: https://github.com/pytorch/pytorch/pull/85887 Approved by: https://github.com/wanchaol
This commit is contained in:
@ -57,6 +57,57 @@ std::string ncclGetErrorWithVersion(ncclResult_t error) {
|
||||
getNcclVersion();
|
||||
}
|
||||
|
||||
// Provides additional detail into NCCL error codes based on when these are
|
||||
// thrown in the NCCL codebase.
|
||||
std::string getNcclErrorDetailStr(
|
||||
ncclResult_t error,
|
||||
c10::optional<std::string> processGroupFailureReason /* = c10::nullopt */
|
||||
) {
|
||||
// Prioritize failure reason provided by PG NCCL first, as it can abort
|
||||
// communicators when it encounters collective timeouts, etc.
|
||||
if (processGroupFailureReason != c10::nullopt) {
|
||||
return *processGroupFailureReason;
|
||||
}
|
||||
std::string interpret;
|
||||
std::string err;
|
||||
#ifdef ENABLE_NCCL_GET_LAST_ERROR
|
||||
err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
|
||||
#endif
|
||||
switch (error) {
|
||||
case ncclUnhandledCudaError:
|
||||
interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
|
||||
break;
|
||||
case ncclSystemError:
|
||||
interpret =
|
||||
"ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. ";
|
||||
#ifndef NCCL_REMOTE_ERROR
|
||||
// Before ncclRemoteError was created, unexpected remote disconnect was
|
||||
// categorized as ncclSystemError
|
||||
interpret += "It can be also caused by unexpected exit of a remote peer.";
|
||||
#endif
|
||||
break;
|
||||
case ncclInternalError:
|
||||
interpret = "ncclInternalError: Internal check failed.";
|
||||
break;
|
||||
case ncclInvalidArgument:
|
||||
interpret = "ncclInvalidArgument: Invalid value for an argument.";
|
||||
break;
|
||||
case ncclInvalidUsage:
|
||||
interpret =
|
||||
"ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
|
||||
break;
|
||||
#ifdef NCCL_REMOTE_ERROR
|
||||
case ncclRemoteError:
|
||||
interpret =
|
||||
"ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.";
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
interpret = "Unknown NCCL error!";
|
||||
}
|
||||
return interpret + err;
|
||||
}
|
||||
|
||||
} // namespace c10d
|
||||
|
||||
#endif // USE_C10D_NCCL
|
||||
|
@ -13,11 +13,14 @@
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
// Both ncclGetLastError() and the ncclRemoteError result code are gated on
// NCCL 2.13+ (any 2.x release with minor >= 13, or any major version >= 3).
#if defined(NCCL_MAJOR) && \
    ((NCCL_MAJOR >= 3) || \
     ((NCCL_MAJOR == 2) && defined(NCCL_MINOR) && (NCCL_MINOR >= 13)))
#define ENABLE_NCCL_GET_LAST_ERROR
#define NCCL_REMOTE_ERROR
#endif
|
||||
|
||||
// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
|
||||
@ -44,44 +47,6 @@
|
||||
#define ENABLE_NCCL_PREMUL_SUM_SUPPORT
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
// Provides additional detail into NCCL error codes based on when these are
|
||||
// thrown in the NCCL codebase.
|
||||
std::string getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
|
||||
// Prioritize failure reason provided by PG NCCL first, as it can abort
|
||||
// communicators when it encounters collective timeouts, etc.
|
||||
if (processGroupFailureReason != c10::nullopt) {
|
||||
return (*processGroupFailureReason).c_str();
|
||||
}
|
||||
std::string interpret;
|
||||
std::string err;
|
||||
#ifdef ENABLE_NCCL_GET_LAST_ERROR
|
||||
err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
|
||||
#endif
|
||||
switch (error) {
|
||||
case ncclUnhandledCudaError:
|
||||
interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
|
||||
break;
|
||||
case ncclSystemError:
|
||||
interpret = "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
|
||||
"It can be also caused by unexpected exit of a remote peer.";
|
||||
break;
|
||||
case ncclInternalError:
|
||||
interpret = "ncclInternalError: Internal check failed.";
|
||||
break;
|
||||
case ncclInvalidArgument:
|
||||
interpret = "ncclInvalidArgument: Invalid value for an argument.";
|
||||
break;
|
||||
case ncclInvalidUsage:
|
||||
interpret = "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
|
||||
break;
|
||||
default:
|
||||
interpret = "Unknown NCCL error!";
|
||||
}
|
||||
return interpret + err;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Macro to throw on a non-successful NCCL return value.
|
||||
#define C10D_NCCL_CHECK(cmd, failureReason) \
|
||||
do { \
|
||||
@ -115,6 +80,12 @@ namespace c10d {
|
||||
std::string getNcclVersion();
|
||||
std::string ncclGetErrorWithVersion(ncclResult_t error);
|
||||
|
||||
// Provides additional detail into NCCL error codes based on when these are
|
||||
// thrown in the NCCL codebase.
//
// `error` is the NCCL result code to explain. `processGroupFailureReason` is
// an optional failure message; it defaults to c10::nullopt, and when it holds
// a value that message is returned instead of an interpretation of `error`.
|
||||
std::string getNcclErrorDetailStr(
|
||||
ncclResult_t error,
|
||||
c10::optional<std::string> processGroupFailureReason = c10::nullopt);
|
||||
|
||||
// RAII wrapper for NCCL communicator
|
||||
class NCCLComm {
|
||||
public:
|
||||
|
Reference in New Issue
Block a user