mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "[c10d] PGNCCL refactor part 2: Simplify ProcessGroupNCCL into single-device style (#119421)"
This reverts commit f3e7d809936d9f1bf63102e8afe241e13ed8766a. Reverted https://github.com/pytorch/pytorch/pull/119421 on behalf of https://github.com/DanilBaibak due to Broken trunk ([comment](https://github.com/pytorch/pytorch/pull/119421#issuecomment-1938169747))
This commit is contained in:
@ -126,32 +126,37 @@
|
||||
TORCH_CHECK_WITH(DistBackendError, false, err); \
|
||||
}
|
||||
|
||||
#define C10D_NCCL_CHECK_TIMEOUT_GROUPEND(cmd, comm, failureReason) \
|
||||
ncclResult_t state = cmd; \
|
||||
auto startTimepoint = std::chrono::steady_clock::now(); \
|
||||
if (state == ncclInProgress) { \
|
||||
do { \
|
||||
if (nccl_nonblocking_timeout() > 0) { \
|
||||
auto currentTimepoint = std::chrono::steady_clock::now(); \
|
||||
auto timeElapsed = std::chrono::duration_cast<std::chrono::seconds>( \
|
||||
currentTimepoint - startTimepoint) \
|
||||
.count(); \
|
||||
if (timeElapsed > nccl_nonblocking_timeout()) { \
|
||||
std::string err = "NCCL timeout in: " + std::string(__FILE__) + \
|
||||
":" + std::to_string(__LINE__) + ", " + \
|
||||
ncclGetErrorWithVersion(state) + "\n" + \
|
||||
getNcclErrorDetailStr(state, failureReason); \
|
||||
TORCH_CHECK_WITH(DistBackendError, false, err); \
|
||||
} \
|
||||
} \
|
||||
ncclCommGetAsyncError(comm->getNcclComm(), &state); \
|
||||
} while (state == ncclInProgress); \
|
||||
} \
|
||||
if (state != ncclSuccess) { \
|
||||
std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \
|
||||
std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(state) + \
|
||||
"\n" + getNcclErrorDetailStr(state, failureReason); \
|
||||
TORCH_CHECK_WITH(DistBackendError, false, err); \
|
||||
// Evaluates `cmd` once, then, if it reports ncclInProgress, waits for every
// communicator in `comms_` to leave the in-progress state.
//
//   cmd           - expression yielding an ncclResult_t; stored in `state`.
//   comms_        - indexable container with size(); each element must support
//                   `->getNcclComm()`, whose result is handed to
//                   ncclCommGetAsyncError.
//   failureReason - forwarded unchanged to getNcclErrorDetailStr when an error
//                   message is built.
//
// Behaviour, in order:
//   * Communicators are polled one after another with ncclCommGetAsyncError
//     until `state` is no longer ncclInProgress.
//   * While polling, if nccl_nonblocking_timeout() is positive and more whole
//     seconds than that have elapsed since `cmd` returned, an "NCCL timeout"
//     message is reported through TORCH_CHECK_WITH(DistBackendError, ...).
//   * The first communicator that settles on anything other than ncclSuccess
//     stops the scan; the final check then reports an "NCCL error" message
//     through TORCH_CHECK_WITH(DistBackendError, ...).
//
// Usage notes:
//   * The expansion is NOT wrapped in do { ... } while (0): it declares the
//     locals `state` and `startTimepoint` directly in the caller's scope, so
//     it can be expanded at most once per scope and those names must be free.
//   * The return value of ncclCommGetAsyncError itself is not inspected; only
//     the value written to `state` is.
//   * The polling loop does not sleep or yield between queries.
#define C10D_NCCL_CHECK_TIMEOUT_GROUPEND(cmd, comms_, failureReason) \
|
||||
ncclResult_t state = cmd; \
|
||||
auto startTimepoint = std::chrono::steady_clock::now(); /* taken once: the timeout budget is shared by all comms, not per comm */ \
|
||||
if (state == ncclInProgress) { \
|
||||
for (const auto i : c10::irange(comms_.size())) { \
|
||||
do { \
|
||||
if (nccl_nonblocking_timeout() > 0) { \
|
||||
auto currentTimepoint = std::chrono::steady_clock::now(); \
|
||||
auto timeElapsed = std::chrono::duration_cast<std::chrono::seconds>( \
|
||||
currentTimepoint - startTimepoint) \
|
||||
.count(); \
|
||||
if (timeElapsed > nccl_nonblocking_timeout()) { \
|
||||
std::string err = "NCCL timeout in: " + std::string(__FILE__) + \
|
||||
":" + std::to_string(__LINE__) + ", " + \
|
||||
ncclGetErrorWithVersion(state) + "\n" + \
|
||||
getNcclErrorDetailStr(state, failureReason); \
|
||||
TORCH_CHECK_WITH(DistBackendError, false, err); \
|
||||
} \
|
||||
} \
|
||||
ncclCommGetAsyncError(comms_[i]->getNcclComm(), &state); /* overwrites state with this comm's async status */ \
|
||||
} while (state == ncclInProgress); \
|
||||
if (state != ncclSuccess) { \
|
||||
break; /* fall through to failed case */ \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (state != ncclSuccess) { \
|
||||
std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \
|
||||
std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(state) + \
|
||||
"\n" + getNcclErrorDetailStr(state, failureReason); \
|
||||
TORCH_CHECK_WITH(DistBackendError, false, err); \
|
||||
}
|
||||
|
||||
// Macro to print and abort on a non-successful NCCL return value.
|
||||
|
Reference in New Issue
Block a user