mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-20 21:14:14 +08:00 
			
		
		
		
	Revert "[PGNCCL] Make sure we do not use split for P2P comm creation (#139013)"
This reverts commit 74878ac271feecfa3ff3d32f78c7d889bcac97d6.
Reverted https://github.com/pytorch/pytorch/pull/139013 on behalf of https://github.com/ZainRizvi due to: Sorry, but this appears to be breaking on trunk. See: distributed/_composable/test_composability/test_pp_composability.py::ComposabilityTest::test_manual_with_data_parallel_dp_type_DDP_ScheduleClass0_use_new_runtime_False [GH job link](https://github.com/pytorch/pytorch/actions/runs/11559910615/job/32177150816) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/74878ac271feecfa3ff3d32f78c7d889bcac97d6) ([comment](https://github.com/pytorch/pytorch/pull/139013#issuecomment-2442667605))
			
			
This commit is contained in:
		| @ -982,28 +982,6 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): | ||||
|             self.assertEqual(send_tensor, recv_tensor) | ||||
|         dist.destroy_process_group() | ||||
|  | ||||
|     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") | ||||
|     @parametrize("eager_init", [True, False]) | ||||
|     def test_subgroup_p2p(self, eager_init: bool): | ||||
|         store = c10d.FileStore(self.file_name, self.world_size) | ||||
|         device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") | ||||
|         c10d.init_process_group( | ||||
|             "nccl", | ||||
|             world_size=self.world_size, | ||||
|             rank=self.rank, | ||||
|             store=store, | ||||
|             device_id=device if eager_init else None, | ||||
|         ) | ||||
|         send_tensor = torch.ones(10, 10, device=device) | ||||
|         group = dist.new_group() | ||||
|         if self.rank == 0: | ||||
|             dist.send(send_tensor, 1, group=group) | ||||
|         if self.rank == 1: | ||||
|             recv_tensor = torch.rand(10, 10, device=device) | ||||
|             dist.recv(recv_tensor, 0, group=group) | ||||
|             self.assertEqual(send_tensor, recv_tensor) | ||||
|         dist.destroy_process_group() | ||||
|  | ||||
|     @requires_nccl() | ||||
|     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") | ||||
|     def test_get_uid(self): | ||||
|  | ||||
| @ -2401,12 +2401,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm( | ||||
| #endif | ||||
|  | ||||
| #ifdef NCCL_HAS_COMM_SPLIT | ||||
|   // Use split to create a new communicator only if: | ||||
|   // 1. The parent comm is known; AND | ||||
|   // 2. The new comm is not for a point-to-point operation. | ||||
|   // ncclCommSplit() is a collective call, so it does not work for P2P | ||||
|   // operations. | ||||
|   if (options_->split_from && !singleP2POp) { | ||||
|   if (options_->split_from) { | ||||
|     // Find a valid, healthy communicator to split from if possible. | ||||
|     std::lock_guard<std::mutex> lock(options_->split_from->mutex_); | ||||
|     auto& other_comms = options_->split_from->devNCCLCommMap_; | ||||
|  | ||||
		Reference in New Issue
	
	Block a user