Revert "[c10d] PGNCCL refactor part 2: Simplify ProcessGroupNCCL into single-device style (#119421)"

This reverts commit f3e7d809936d9f1bf63102e8afe241e13ed8766a.

Reverted https://github.com/pytorch/pytorch/pull/119421 on behalf of https://github.com/DanilBaibak due to Broken trunk ([comment](https://github.com/pytorch/pytorch/pull/119421#issuecomment-1938169747))
This commit is contained in:
PyTorch MergeBot
2024-02-12 07:34:20 +00:00
parent 8a3c241094
commit 0342b227e5
8 changed files with 1128 additions and 830 deletions

View File

@ -126,32 +126,37 @@
TORCH_CHECK_WITH(DistBackendError, false, err); \
}
#define C10D_NCCL_CHECK_TIMEOUT_GROUPEND(cmd, comm, failureReason) \
ncclResult_t state = cmd; \
auto startTimepoint = std::chrono::steady_clock::now(); \
if (state == ncclInProgress) { \
do { \
if (nccl_nonblocking_timeout() > 0) { \
auto currentTimepoint = std::chrono::steady_clock::now(); \
auto timeElapsed = std::chrono::duration_cast<std::chrono::seconds>( \
currentTimepoint - startTimepoint) \
.count(); \
if (timeElapsed > nccl_nonblocking_timeout()) { \
std::string err = "NCCL timeout in: " + std::string(__FILE__) + \
":" + std::to_string(__LINE__) + ", " + \
ncclGetErrorWithVersion(state) + "\n" + \
getNcclErrorDetailStr(state, failureReason); \
TORCH_CHECK_WITH(DistBackendError, false, err); \
} \
} \
ncclCommGetAsyncError(comm->getNcclComm(), &state); \
} while (state == ncclInProgress); \
} \
if (state != ncclSuccess) { \
std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \
std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(state) + \
"\n" + getNcclErrorDetailStr(state, failureReason); \
TORCH_CHECK_WITH(DistBackendError, false, err); \
#define C10D_NCCL_CHECK_TIMEOUT_GROUPEND(cmd, comms_, failureReason) \
ncclResult_t state = cmd; \
auto startTimepoint = std::chrono::steady_clock::now(); \
if (state == ncclInProgress) { \
for (const auto i : c10::irange(comms_.size())) { \
do { \
if (nccl_nonblocking_timeout() > 0) { \
auto currentTimepoint = std::chrono::steady_clock::now(); \
auto timeElapsed = std::chrono::duration_cast<std::chrono::seconds>( \
currentTimepoint - startTimepoint) \
.count(); \
if (timeElapsed > nccl_nonblocking_timeout()) { \
std::string err = "NCCL timeout in: " + std::string(__FILE__) + \
":" + std::to_string(__LINE__) + ", " + \
ncclGetErrorWithVersion(state) + "\n" + \
getNcclErrorDetailStr(state, failureReason); \
TORCH_CHECK_WITH(DistBackendError, false, err); \
} \
} \
ncclCommGetAsyncError(comms_[i]->getNcclComm(), &state); \
} while (state == ncclInProgress); \
if (state != ncclSuccess) { \
break; /* fall through to failed case */ \
} \
} \
} \
if (state != ncclSuccess) { \
std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \
std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(state) + \
"\n" + getNcclErrorDetailStr(state, failureReason); \
TORCH_CHECK_WITH(DistBackendError, false, err); \
}
// Macro to print and abort on a non-successful NCCL return value.