mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "[PGNCCL] Make sure we do not use split for P2P comm creation (#139013)"
This reverts commit 74878ac271feecfa3ff3d32f78c7d889bcac97d6.
Reverted https://github.com/pytorch/pytorch/pull/139013 on behalf of https://github.com/ZainRizvi due to Sorry but this appears to be breaking on trunk. See: distributed/_composable/test_composability/test_pp_composability.py::ComposabilityTest::test_manual_with_data_parallel_dp_type_DDP_ScheduleClass0_use_new_runtime_False [GH job link](https://github.com/pytorch/pytorch/actions/runs/11559910615/job/32177150816) [HUD commit link](74878ac271) ([comment](https://github.com/pytorch/pytorch/pull/139013#issuecomment-2442667605))
This commit is contained in:
@ -982,28 +982,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
|
||||
self.assertEqual(send_tensor, recv_tensor)
|
||||
dist.destroy_process_group()
|
||||
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
@parametrize("eager_init", [True, False])
|
||||
def test_subgroup_p2p(self, eager_init: bool):
|
||||
store = c10d.FileStore(self.file_name, self.world_size)
|
||||
device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
|
||||
c10d.init_process_group(
|
||||
"nccl",
|
||||
world_size=self.world_size,
|
||||
rank=self.rank,
|
||||
store=store,
|
||||
device_id=device if eager_init else None,
|
||||
)
|
||||
send_tensor = torch.ones(10, 10, device=device)
|
||||
group = dist.new_group()
|
||||
if self.rank == 0:
|
||||
dist.send(send_tensor, 1, group=group)
|
||||
if self.rank == 1:
|
||||
recv_tensor = torch.rand(10, 10, device=device)
|
||||
dist.recv(recv_tensor, 0, group=group)
|
||||
self.assertEqual(send_tensor, recv_tensor)
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
def test_get_uid(self):
|
||||
|
@ -2401,12 +2401,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
|
||||
#endif
|
||||
|
||||
#ifdef NCCL_HAS_COMM_SPLIT
|
||||
// Use split to create a new communicator only if:
|
||||
// 1. The parent comm is known; AND
|
||||
// 2. The new comm is not for a point-to-point operation.
|
||||
// ncclCommSplit() is a collective call, so it does not work for P2P
|
||||
// operations.
|
||||
if (options_->split_from && !singleP2POp) {
|
||||
if (options_->split_from) {
|
||||
// Find a valid, healthy communicator to split from if possible.
|
||||
std::lock_guard<std::mutex> lock(options_->split_from->mutex_);
|
||||
auto& other_comms = options_->split_from->devNCCLCommMap_;
|
||||
|
Reference in New Issue
Block a user