Revert "[PGNCCL] Use non-blocking mode by default in eager init (#138527)"

This reverts commit 8fbf866904661b16cba4c799af81121557ba9da8.

Reverted https://github.com/pytorch/pytorch/pull/138527 on behalf of https://github.com/jeanschmidt due to Seems to have introduce regressions on main, pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 2, 3, linux.g4dn.12xlarge.nvidia.gpu) checking if revert will do ([comment](https://github.com/pytorch/pytorch/pull/138527#issuecomment-2432479338))
This commit is contained in:
PyTorch MergeBot
2024-10-23 14:49:49 +00:00
parent 2f007e5de5
commit cdfe1bffd1
4 changed files with 34 additions and 73 deletions

View File

@@ -237,6 +237,7 @@ DEFINE_CONSTANT(started_state, "started");
TORCH_API size_t hashTensors(const std::vector<at::Tensor>& tensors);
TORCH_API std::string getNcclVersion();
TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error);
bool nccl_use_nonblocking();
int nccl_nonblocking_timeout();
// Provides additional detail into NCCL error codes based on when these are
@@ -313,8 +314,6 @@ class NCCLComm {
comm->ncclId_ = commId;
comm->rank_ = rank;
comm->initialized_ = true;
// Old style comm is always blocking.
comm->nonBlocking_ = false;
return comm;
}
@@ -325,19 +324,26 @@ class NCCLComm {
ncclUniqueId commId,
ncclConfig_t& config) {
auto comm = std::make_shared<NCCLComm>();
comm->nonBlocking_ = config.blocking == 0;
LOG(INFO) << "Rank " << rank << ": creating NCCL communicator with mode: "
<< (comm->nonBlocking_ ? "nonblocking" : "blocking");
C10D_NCCL_CHECK_NONBLOCKING(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
bool isInitialized = false;
if (nccl_use_nonblocking()) {
config.blocking = 0;
LOG(INFO) << "Rank " << rank
<< ": creating NCCL communicator in nonblocking mode";
C10D_NCCL_CHECK_NONBLOCKING(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
} else {
C10D_NCCL_CHECK(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
// under blocking mode, comm is initialized after NCCL CHECK
isInitialized = true;
}
comm->ncclId_ = commId;
comm->rank_ = rank;
// Under blocking mode, comm is initialized immediately after NCCL init
// returns; Under nonblocking mode, we check whether comm is initialized the
// *next* time ncclComm_ is accessed.
comm->initialized_ = !comm->nonBlocking_;
comm->initialized_ = isInitialized;
return comm;
}
@@ -381,7 +387,6 @@ class NCCLComm {
std::swap(aborted_, other.aborted_);
std::swap(ncclAsyncErr_, other.ncclAsyncErr_);
std::swap(initialized_, other.initialized_);
std::swap(nonBlocking_, other.nonBlocking_);
}
ncclComm_t getNcclComm();
@@ -545,10 +550,6 @@ class NCCLComm {
// better error messaging.
std::optional<std::string> commFailureReason_;
bool initialized_{false};
// Whether this communicator is using nonblocking mode. Recorded during comm
// creation or split. For safety, we give a default value of true (more
// protection).
bool nonBlocking_{true};
#ifdef NCCL_HAS_COMM_REGISTER
// Stores handlers for tensors registered by NCCL
std::unordered_map<void*, void*> registeredSegmentHandles_;