[c10d] Error out the case when registering symmetric memory without eager init (#160145)

Instead of implicitly creating an NCCL communicator inside mem pool registration for symmetric memory, we now error out: registration is supported only in the eager-init case, where the NCCL communicator has already been initialized.
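For illustration, a minimal sketch of the now-required eager-init pattern (mirroring the updated test below). Passing device_id to init_process_group is what creates the NCCL communicator eagerly; rank, world_size, and the rendezvous setup (MASTER_ADDR/MASTER_PORT) are assumed to come from your launcher and are hypothetical here:

    import torch
    import torch.distributed as c10d

    # Assumes `rank`/`world_size` are provided by the launcher and
    # MASTER_ADDR/MASTER_PORT are set in the environment (sketch only).
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)

    # device_id triggers eager creation of the NCCL communicator, so it
    # already exists when the mem pool is registered below.
    c10d.init_process_group(
        backend="nccl", rank=rank, world_size=world_size, device_id=device
    )

    pg = c10d.distributed_c10d._get_default_group()
    backend = pg._get_backend(device)

    # Symmetric-memory pool backed by the NCCL allocator; registering it
    # relies on the communicator eagerly initialized above.
    pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
    backend.register_mem_pool(pool)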

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160145
Approved by: https://github.com/kwen2501
Author: fduwjj
Date: 2025-08-12 20:13:16 +00:00
Committed by: PyTorch MergeBot
Parent: 0d71ca2c46
Commit: b1f43548ca
2 changed files with 32 additions and 28 deletions


@@ -3172,35 +3172,42 @@ class NcclRegistrationTest(MultiProcessTestCase):
     @requires_multicast_support()
     def test_nccl_window_registration(self):
         store = c10d.FileStore(self.file_name, self.world_size)
-        c10d.init_process_group(
-            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
-        )
         device = torch.device(f"cuda:{self.rank}")
         torch.cuda.set_device(self.rank)
-        pg = c10d.distributed_c10d._get_default_group()
-        backend = pg._get_backend(torch.device(device))
-
-        # Use NCCL memory allocator
-        # enable symmetric memory usage in NCCL
-        pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
+        with torch.cuda.device(device):
+            # Eager init the nccl comm so that we don't implicitly create one during register_mem_pool
+            c10d.init_process_group(
+                backend="nccl",
+                rank=self.rank,
+                world_size=self.world_size,
+                store=store,
+                device_id=device,
+            )
+            pg = c10d.distributed_c10d._get_default_group()
+            backend = pg._get_backend(torch.device(device))
+            # Use NCCL memory allocator
+            # enable symmetric memory usage in NCCL
+            pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)

-        # allocate memory with ncclMemAlloc
-        # note: symmetric kernels are not available for dtypes like torch.int64
-        with torch.cuda.use_mem_pool(pool):
-            tensor = torch.arange(1024 * 1024 * 2, device=device, dtype=torch.float32)
+            # allocate memory with ncclMemAlloc
+            # note: symmetric kernels are not available for dtypes like torch.int64
+            with torch.cuda.use_mem_pool(pool):
+                tensor = torch.arange(
+                    1024 * 1024 * 2, device=device, dtype=torch.float32
+                )

-        # register buffers to NCCL
-        backend.register_mem_pool(pool)
+            # register buffers to NCCL
+            backend.register_mem_pool(pool)

-        # allreduce now should use NVIDIA Switches
-        pg.allreduce(tensor).wait()
-        torch.cuda.synchronize(device=device)
+            # allreduce now should use NVIDIA Switches
+            pg.allreduce(tensor).wait()
+            torch.cuda.synchronize(device=device)

-        # de-register buffers from NCCL
-        backend.deregister_mem_pool(pool)
+            # de-register buffers from NCCL
+            backend.deregister_mem_pool(pool)

-        # clean up memory
-        del tensor, pool
+            # clean up memory
+            del tensor, pool

         with open(os.environ["NCCL_DEBUG_FILE"]) as f:
             nccl_debug_file_content = f.read()
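Conversely, a sketch of the setup this change rejects: with default (lazy) initialization no NCCL communicator exists yet at registration time, so register_mem_pool is now expected to raise rather than implicitly create one. The exact exception type and message are assumptions here; the surrounding names follow the example above:

    # Lazy init: no device_id, so no NCCL communicator is created yet.
    c10d.init_process_group(
        backend="nccl", rank=rank, world_size=world_size, store=store
    )
    backend = c10d.distributed_c10d._get_default_group()._get_backend(device)
    pool = torch.cuda.MemPool(backend.mem_allocator, symmetric=True)
    backend.register_mem_pool(pool)  # now errors out instead of creating a comm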