Adopt ncclRemoteError (#85887)

`ncclRemoteError` was added in NCCL 2.13 to indicate a network error or a remote process exiting prematurely.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85887
Approved by: https://github.com/wanchaol
This commit is contained in:
Ke Wen
2022-09-30 09:17:49 +00:00
committed by PyTorch MergeBot
parent 8f4edf1e1d
commit 1f38abb5d2
2 changed files with 60 additions and 38 deletions

View File

@ -57,6 +57,57 @@ std::string ncclGetErrorWithVersion(ncclResult_t error) {
getNcclVersion();
}
// Translates an NCCL result code into a human-readable explanation. When the
// caller supplies a process-group failure reason, that text is returned
// verbatim instead of a description of the error code.
std::string getNcclErrorDetailStr(
    ncclResult_t error,
    c10::optional<std::string> processGroupFailureReason /* = c10::nullopt */
) {
  // PG NCCL can abort communicators on its own (collective timeouts, etc.), so
  // a reason it recorded outranks anything derived from the error code.
  if (processGroupFailureReason.has_value()) {
    return processGroupFailureReason.value();
  }

  // Text NCCL reports for its most recent error; stays empty when
  // ENABLE_NCCL_GET_LAST_ERROR is not defined.
  std::string lastErrorSuffix;
#ifdef ENABLE_NCCL_GET_LAST_ERROR
  lastErrorSuffix = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
#endif

  // Fixed description selected by the error code.
  std::string description;
  switch (error) {
    case ncclUnhandledCudaError: {
      description = "ncclUnhandledCudaError: Call to CUDA function failed.";
      break;
    }
    case ncclSystemError: {
      description =
          "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. ";
#ifndef NCCL_REMOTE_ERROR
      // Without ncclRemoteError, an unexpected remote disconnect is reported
      // under ncclSystemError, so mention that possibility here.
      description +=
          "It can be also caused by unexpected exit of a remote peer.";
#endif
      break;
    }
    case ncclInternalError: {
      description = "ncclInternalError: Internal check failed.";
      break;
    }
    case ncclInvalidArgument: {
      description = "ncclInvalidArgument: Invalid value for an argument.";
      break;
    }
    case ncclInvalidUsage: {
      description =
          "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
      break;
    }
#ifdef NCCL_REMOTE_ERROR
    case ncclRemoteError: {
      description =
          "ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.";
      break;
    }
#endif
    default: {
      description = "Unknown NCCL error!";
    }
  }

  return description + lastErrorSuffix;
}
} // namespace c10d
#endif // USE_C10D_NCCL

View File

@ -13,11 +13,14 @@
#include <c10/util/Optional.h>
// ncclGetLastError() and ncclRemoteError are both available only in NCCL
// 2.13+, so one version check (2.13 or later within 2.x, or any major version
// from 3 upward) gates both feature macros.
#if defined(NCCL_MAJOR) &&                                              \
    ((NCCL_MAJOR >= 3) ||                                               \
     ((NCCL_MAJOR == 2) && defined(NCCL_MINOR) && (NCCL_MINOR >= 13)))
#define ENABLE_NCCL_GET_LAST_ERROR
#define NCCL_REMOTE_ERROR
#endif
// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
@ -44,44 +47,6 @@
#define ENABLE_NCCL_PREMUL_SUM_SUPPORT
#endif
namespace {
// Provides additional detail into NCCL error codes based on when these are
// thrown in the NCCL codebase.
std::string getNcclErrorDetailStr(ncclResult_t error, c10::optional<std::string> processGroupFailureReason = c10::nullopt) {
// Prioritize failure reason provided by PG NCCL first, as it can abort
// communicators when it encounters collective timeouts, etc.
if (processGroupFailureReason != c10::nullopt) {
return (*processGroupFailureReason).c_str();
}
std::string interpret;
std::string err;
#ifdef ENABLE_NCCL_GET_LAST_ERROR
err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
#endif
switch (error) {
case ncclUnhandledCudaError:
interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
break;
case ncclSystemError:
interpret = "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. "
"It can be also caused by unexpected exit of a remote peer.";
break;
case ncclInternalError:
interpret = "ncclInternalError: Internal check failed.";
break;
case ncclInvalidArgument:
interpret = "ncclInvalidArgument: Invalid value for an argument.";
break;
case ncclInvalidUsage:
interpret = "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
break;
default:
interpret = "Unknown NCCL error!";
}
return interpret + err;
}
} // namespace
// Macro to throw on a non-successful NCCL return value.
#define C10D_NCCL_CHECK(cmd, failureReason) \
do { \
@ -115,6 +80,12 @@ namespace c10d {
// Presumably returns the NCCL version formatted as a string — the definition
// is not visible here; confirm the exact format there.
std::string getNcclVersion();
// Returns an error string for `error`. The visible tail of its definition
// calls getNcclVersion(), so the version is presumably part of the message —
// confirm against the full definition.
std::string ncclGetErrorWithVersion(ncclResult_t error);
// Provides additional detail into NCCL error codes based on when these are
// thrown in the NCCL codebase.
//
// error: the NCCL result code to describe.
// processGroupFailureReason: optional reason supplied by the caller; when it
//   holds a value, that text is returned as-is instead of a description of
//   `error`. Defaults to c10::nullopt.
//
// Otherwise the result is a fixed description chosen by `error` ("Unknown NCCL
// error!" for unrecognized codes), followed by the ncclGetLastError() text when
// ENABLE_NCCL_GET_LAST_ERROR is defined.
std::string getNcclErrorDetailStr(
ncclResult_t error,
c10::optional<std::string> processGroupFailureReason = c10::nullopt);
// RAII wrapper for NCCL communicator
class NCCLComm {
public: