[c10d] Fix extra CUDA context created by barrier (#149144)

Fixes #149119.

In ProcessGroup.hpp, we create a dummy tensor for dispatching. This
requires a correct device index. This PR uses the `device_id` given by the
user when calling `init_process_group`.
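
For illustration, a minimal usage sketch (assuming a `torchrun` launch that sets `LOCAL_RANK`): passing `device_id` to `init_process_group` gives c10d the device index for the dummy dispatch tensor, so `barrier()` no longer initializes an extra context on `cuda:0`.

```python
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
torch.cuda.set_device(local_rank)

# device_id tells c10d which device the dummy dispatch tensor should live on.
dist.init_process_group(
    backend="nccl",
    device_id=torch.device(f"cuda:{local_rank}"),
)
dist.barrier()  # dispatches on cuda:<local_rank>, not cuda:0
dist.destroy_process_group()
```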

This PR also uses `torch._C._get_accelerator()` to determine the device
type.
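
As a rough sketch of what that call returns (it is a private API; the public `torch.accelerator.current_accelerator()` is the comparable user-facing call):

```python
import torch

# Returns a torch.device naming the current accelerator backend,
# e.g. device(type='cuda') on a CUDA build.
acc = torch._C._get_accelerator()
print(acc.type)
```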

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149144
Approved by: https://github.com/XilunWu, https://github.com/fduwjj, https://github.com/cyyever
Author: Ke Wen
Date: 2025-05-02 16:57:51 -07:00
Committed by: PyTorch MergeBot
Parent: 12a8b70247
Commit: a8f727c439
2 changed files with 23 additions and 21 deletions

@@ -3516,17 +3516,6 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
         c10d.barrier(device_ids=[self.rank])
 
-    @requires_nccl()
-    @skip_if_lt_x_gpu(2)
-    def test_nccl_barrier_device_ids_function_argument(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        c10d.init_process_group(
-            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
-        )
-        with self.assertRaisesRegex(TypeError, "Invalid function argument"):
-            c10d.barrier(device_ids=self.rank)
-
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_unwaited(self) -> None: