Revert "[PGNCCL] Make sure we do not use split for P2P comm creation (#139013)"

This reverts commit 74878ac271feecfa3ff3d32f78c7d889bcac97d6.

Reverted https://github.com/pytorch/pytorch/pull/139013 on behalf of https://github.com/ZainRizvi due to Sorry but this appears to be breaking on trunk. See: distributed/_composable/test_composability/test_pp_composability.py::ComposabilityTest::test_manual_with_data_parallel_dp_type_DDP_ScheduleClass0_use_new_runtime_False [GH job link](https://github.com/pytorch/pytorch/actions/runs/11559910615/job/32177150816) [HUD commit link](74878ac271) ([comment](https://github.com/pytorch/pytorch/pull/139013#issuecomment-2442667605))
Author: PyTorch MergeBot
Date: 2024-10-28 21:30:28 +00:00
Parent: 1a275fea4b
Commit: 02339e674d

2 changed files with 1 addition and 28 deletions

File: test/distributed/test_c10d_nccl.py

@@ -982,28 +982,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
         self.assertEqual(send_tensor, recv_tensor)
         dist.destroy_process_group()
 
-    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
-    @parametrize("eager_init", [True, False])
-    def test_subgroup_p2p(self, eager_init: bool):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
-        c10d.init_process_group(
-            "nccl",
-            world_size=self.world_size,
-            rank=self.rank,
-            store=store,
-            device_id=device if eager_init else None,
-        )
-        send_tensor = torch.ones(10, 10, device=device)
-        group = dist.new_group()
-        if self.rank == 0:
-            dist.send(send_tensor, 1, group=group)
-        if self.rank == 1:
-            recv_tensor = torch.rand(10, 10, device=device)
-            dist.recv(recv_tensor, 0, group=group)
-            self.assertEqual(send_tensor, recv_tensor)
-        dist.destroy_process_group()
-
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_get_uid(self):
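
Background on the eager_init parametrization in the removed test: passing device_id to init_process_group opts into eager NCCL communicator creation, which also makes later subgroup communicators eligible to be derived from the default one via ncclCommSplit (NCCL >= 2.18). A minimal sketch of that pattern, assuming a 2-GPU torchrun --nproc-per-node=2 launch; the script below is illustrative, not part of this commit:

import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")

# device_id requests eager initialization: the default NCCL communicator
# is created here instead of lazily on the first collective.
dist.init_process_group("nccl", device_id=device)

# With an eagerly created default communicator, a subgroup's communicator
# may be split from it rather than re-initialized from scratch.
group = dist.new_group()
dist.all_reduce(torch.ones(1, device=device), group=group)
dist.destroy_process_group()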

File: torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -2401,12 +2401,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
 #endif
 #ifdef NCCL_HAS_COMM_SPLIT
-  // Use split to create a new communicator only if:
-  // 1. The parent comm is known; AND
-  // 2. The new comm is not for a point-to-point operation.
-  // ncclCommSplit() is a collective call, so it does not work for P2P
-  // operations.
-  if (options_->split_from && !singleP2POp) {
+  if (options_->split_from) {
     // Find a valid, healthy communicator to split from if possible.
     std::lock_guard<std::mutex> lock(options_->split_from->mutex_);
     auto& other_comms = options_->split_from->devNCCLCommMap_;
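
For context on the guard being reverted above: ncclCommSplit() must be invoked by every rank that holds the parent communicator, while a point-to-point communicator is created on demand by only the two ranks doing the send/recv; the remaining ranks never reach the creation path, so a split would leave the two callers blocked forever. A sketch of that participation asymmetry at the Python API level, assuming a 4-rank torchrun launch; illustrative, not part of this commit:

import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
dist.init_process_group("nccl")

# new_group() is called by EVERY rank, matching ncclCommSplit's collective
# contract, so split-based creation is safe for ordinary subgroups.
subgroup = dist.new_group(ranks=[0, 1])

# A point-to-point communicator, by contrast, is created on demand inside
# send/recv by the two participating ranks only; ranks 2 and 3 never reach
# this code, so no collective split over the parent comm is possible here.
t = torch.ones(4, device=device)
if rank == 0:
    dist.send(t, dst=1, group=subgroup)
elif rank == 1:
    dist.recv(t, src=0, group=subgroup)

dist.destroy_process_group()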