[c10d] Fix extra CUDA context created by barrier (#149144)

Fixes #149119.

In ProcessGroup.hpp, we create a dummy tensor for dispatching. This
requires a correct device index. This PR uses the `device_id` given by the
user when calling `init_process_group`.
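
For illustration, a minimal usage sketch (assuming a `torchrun` launch that sets `LOCAL_RANK`): passing `device_id` to `init_process_group` gives c10d the device index for the dummy dispatch tensor, so `barrier()` no longer initializes an extra context on `cuda:0`.

```python
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
torch.cuda.set_device(local_rank)

# device_id tells c10d which device the dummy dispatch tensor should live on.
dist.init_process_group(
    backend="nccl",
    device_id=torch.device(f"cuda:{local_rank}"),
)
dist.barrier()  # dispatches on cuda:<local_rank>, not cuda:0
dist.destroy_process_group()
```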

This PR also uses `torch._C._get_accelerator()` to determine the device
type.
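
As a rough sketch of what that call returns (it is a private API; the public `torch.accelerator.current_accelerator()` is the comparable user-facing call):

```python
import torch

# Returns a torch.device naming the current accelerator backend,
# e.g. device(type='cuda') on a CUDA build.
acc = torch._C._get_accelerator()
print(acc.type)
```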

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149144
Approved by: https://github.com/XilunWu, https://github.com/fduwjj, https://github.com/cyyever
Author: Ke Wen
Date: 2025-05-02 16:57:51 -07:00
Committed by: PyTorch MergeBot
Parent: 12a8b70247
Commit: a8f727c439
2 changed files with 23 additions and 21 deletions

@@ -3516,17 +3516,6 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
         c10d.barrier(device_ids=[self.rank])
 
-    @requires_nccl()
-    @skip_if_lt_x_gpu(2)
-    def test_nccl_barrier_device_ids_function_argument(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        c10d.init_process_group(
-            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
-        )
-        with self.assertRaisesRegex(TypeError, "Invalid function argument"):
-            c10d.barrier(device_ids=self.rank)
-
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_unwaited(self) -> None: