Revert "[PGNCCL] Make sure we do not use split for P2P comm creation (#139013)"

This reverts commit 74878ac271feecfa3ff3d32f78c7d889bcac97d6.

Reverted https://github.com/pytorch/pytorch/pull/139013 on behalf of https://github.com/ZainRizvi due to Sorry but this appears to be breaking on trunk. See: distributed/_composable/test_composability/test_pp_composability.py::ComposabilityTest::test_manual_with_data_parallel_dp_type_DDP_ScheduleClass0_use_new_runtime_False [GH job link](https://github.com/pytorch/pytorch/actions/runs/11559910615/job/32177150816) [HUD commit link](74878ac271) ([comment](https://github.com/pytorch/pytorch/pull/139013#issuecomment-2442667605))
Author: PyTorch MergeBot
Date: 2024-10-28 21:30:28 +00:00
Parent: 1a275fea4b
Commit: 02339e674d

2 changed files with 1 addition and 28 deletions

File: test/distributed/test_c10d_nccl.py

@@ -982,28 +982,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
         self.assertEqual(send_tensor, recv_tensor)
         dist.destroy_process_group()
 
-    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
-    @parametrize("eager_init", [True, False])
-    def test_subgroup_p2p(self, eager_init: bool):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
-        c10d.init_process_group(
-            "nccl",
-            world_size=self.world_size,
-            rank=self.rank,
-            store=store,
-            device_id=device if eager_init else None,
-        )
-        send_tensor = torch.ones(10, 10, device=device)
-        group = dist.new_group()
-        if self.rank == 0:
-            dist.send(send_tensor, 1, group=group)
-        if self.rank == 1:
-            recv_tensor = torch.rand(10, 10, device=device)
-            dist.recv(recv_tensor, 0, group=group)
-            self.assertEqual(send_tensor, recv_tensor)
-        dist.destroy_process_group()
-
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_get_uid(self):
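
Background on the eager_init parametrization in the removed test: passing device_id to init_process_group opts into eager NCCL communicator creation, which also makes later subgroup communicators eligible to be derived from the default one via ncclCommSplit (NCCL >= 2.18). A minimal sketch of that pattern, assuming a 2-GPU torchrun --nproc-per-node=2 launch; the script below is illustrative, not part of this commit:

import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")

# device_id requests eager initialization: the default NCCL communicator
# is created here instead of lazily on the first collective.
dist.init_process_group("nccl", device_id=device)

# With an eagerly created default communicator, a subgroup's communicator
# may be split from it rather than re-initialized from scratch.
group = dist.new_group()
dist.all_reduce(torch.ones(1, device=device), group=group)
dist.destroy_process_group()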

File: torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -2401,12 +2401,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
 #endif
 #ifdef NCCL_HAS_COMM_SPLIT
-  // Use split to create a new communicator only if:
-  // 1. The parent comm is known; AND
-  // 2. The new comm is not for a point-to-point operation.
-  // ncclCommSplit() is a collective call, so it does not work for P2P
-  // operations.
-  if (options_->split_from && !singleP2POp) {
+  if (options_->split_from) {
     // Find a valid, healthy communicator to split from if possible.
     std::lock_guard<std::mutex> lock(options_->split_from->mutex_);
     auto& other_comms = options_->split_from->devNCCLCommMap_;
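
For context on the guard being reverted above: ncclCommSplit() must be invoked by every rank that holds the parent communicator, while a point-to-point communicator is created on demand by only the two ranks doing the send/recv; the remaining ranks never reach the creation path, so a split would leave the two callers blocked forever. A sketch of that participation asymmetry at the Python API level, assuming a 4-rank torchrun launch; illustrative, not part of this commit:

import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
dist.init_process_group("nccl")

# new_group() is called by EVERY rank, matching ncclCommSplit's collective
# contract, so split-based creation is safe for ordinary subgroups.
subgroup = dist.new_group(ranks=[0, 1])

# A point-to-point communicator, by contrast, is created on demand inside
# send/recv by the two participating ranks only; ranks 2 and 3 never reach
# this code, so no collective split over the parent comm is possible here.
t = torch.ones(4, device=device)
if rank == 0:
    dist.send(t, dst=1, group=subgroup)
elif rank == 1:
    dist.recv(t, src=0, group=subgroup)

dist.destroy_process_group()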