Revert "[PGNCCL] Use non-blocking mode by default in eager init (#138527)"

This reverts commit 8fbf866904661b16cba4c799af81121557ba9da8.

Reverted https://github.com/pytorch/pytorch/pull/138527 on behalf of https://github.com/jeanschmidt due to Seems to have introduce regressions on main, pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 2, 3, linux.g4dn.12xlarge.nvidia.gpu) checking if revert will do ([comment](https://github.com/pytorch/pytorch/pull/138527#issuecomment-2432479338))
This commit is contained in:
PyTorch MergeBot
2024-10-23 14:49:49 +00:00
parent 2f007e5de5
commit cdfe1bffd1
4 changed files with 34 additions and 73 deletions

View File

@@ -237,6 +237,7 @@ DEFINE_CONSTANT(started_state, "started");
TORCH_API size_t hashTensors(const std::vector<at::Tensor>& tensors);
TORCH_API std::string getNcclVersion();
TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error);
bool nccl_use_nonblocking();
int nccl_nonblocking_timeout();
// Provides additional detail into NCCL error codes based on when these are
@@ -313,8 +314,6 @@ class NCCLComm {
comm->ncclId_ = commId;
comm->rank_ = rank;
comm->initialized_ = true;
// Old style comm is always blocking.
comm->nonBlocking_ = false;
return comm;
}
@@ -325,19 +324,26 @@ class NCCLComm {
ncclUniqueId commId,
ncclConfig_t& config) {
auto comm = std::make_shared<NCCLComm>();
comm->nonBlocking_ = config.blocking == 0;
LOG(INFO) << "Rank " << rank << ": creating NCCL communicator with mode: "
<< (comm->nonBlocking_ ? "nonblocking" : "blocking");
C10D_NCCL_CHECK_NONBLOCKING(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
bool isInitialized = false;
if (nccl_use_nonblocking()) {
config.blocking = 0;
LOG(INFO) << "Rank " << rank
<< ": creating NCCL communicator in nonblocking mode";
C10D_NCCL_CHECK_NONBLOCKING(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
} else {
C10D_NCCL_CHECK(
ncclCommInitRankConfig(
&(comm->ncclComm_), numRanks, commId, rank, &config),
std::nullopt);
// under blocking mode, comm is initialized after NCCL CHECK
isInitialized = true;
}
comm->ncclId_ = commId;
comm->rank_ = rank;
// Under blocking mode, comm is initialized immediately after NCCL init
// returns; Under nonblocking mode, we check whether comm is initialized the
// *next* time ncclComm_ is accessed.
comm->initialized_ = !comm->nonBlocking_;
comm->initialized_ = isInitialized;
return comm;
}
@@ -381,7 +387,6 @@ class NCCLComm {
std::swap(aborted_, other.aborted_);
std::swap(ncclAsyncErr_, other.ncclAsyncErr_);
std::swap(initialized_, other.initialized_);
std::swap(nonBlocking_, other.nonBlocking_);
}
ncclComm_t getNcclComm();
@@ -545,10 +550,6 @@ class NCCLComm {
// better error messaging.
std::optional<std::string> commFailureReason_;
bool initialized_{false};
// Whether this communicator is using nonblocking mode. Recorded during comm
// creation or split. For safety, we give a default value of true (more
// protection).
bool nonBlocking_{true};
#ifdef NCCL_HAS_COMM_REGISTER
// Stores handlers for tensors registered by NCCL
std::unordered_map<void*, void*> registeredSegmentHandles_;