[FSDP2] Provide a public API to share CUDA streams across roots (#165024)

For pipeline parallelism, we can have multiple FSDP roots (one per model chunk):
```
model = nn.Sequential(chunk0, chunk1)
fully_shard(model[0])
fully_shard(model[1])
```

We can call `share_comm_ctx` to share the all-gather, reduce-scatter, and all-reduce CUDA streams across these roots. This avoids inter-stream memory fragmentation:
```
from torch.distributed.fsdp import share_comm_ctx
share_comm_ctx([model[0], model[1]])
```
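
Putting the two snippets together, here is a minimal end-to-end sketch. It assumes `torch.distributed` is already initialized with an accelerator backend (e.g. launched via `torchrun`); the chunk definitions and sizes are illustrative placeholders, not code from this PR:
```
import torch.nn as nn
from torch.distributed.fsdp import fully_shard, share_comm_ctx

# Illustrative pipeline-parallel model chunks (placeholders).
chunk0 = nn.Sequential(nn.Linear(16, 16), nn.ReLU())
chunk1 = nn.Sequential(nn.Linear(16, 16), nn.ReLU())
model = nn.Sequential(chunk0, chunk1)

# Sharding each chunk separately makes each one its own FSDP root.
fully_shard(model[0])
fully_shard(model[1])

# Reuse a single set of all-gather / reduce-scatter / all-reduce streams
# across the roots instead of allocating one set per root.
share_comm_ctx([model[0], model[1]])
```
In a real pipeline-parallel setup the chunks would be the per-stage submodules produced by the pipeline split.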

Unit test: `pytest -s test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_share_comm_context`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165024
Approved by: https://github.com/mori360
Author: Wei Feng
Date: 2025-10-13 14:03:57 -07:00
Committed by: PyTorch MergeBot
Parent: 9b6be53326
Commit: 6918f17114
6 changed files with 192 additions and 1 deletion

View File

@@ -123,3 +123,7 @@ The frontend API is `fully_shard` that can be called on a `module`:
 .. autoclass:: CPUOffloadPolicy
     :members:
 ```
+
+```{eval-rst}
+.. autofunction:: share_comm_ctx
+```

View File

@@ -6,7 +6,7 @@ import functools
 import itertools
 import unittest
 from collections import defaultdict
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from typing import Any, Optional, Union

 import torch
@@ -24,6 +24,11 @@ from torch.distributed.fsdp import (
     fully_shard,
     OffloadPolicy,
     register_fsdp_forward_method,
+    share_comm_ctx,
+)
+from torch.distributed.fsdp._fully_shard._fsdp_collectives import (
+    foreach_all_gather,
+    foreach_reduce,
 )
 from torch.distributed.tensor import DTensor, init_device_mesh, Shard
 from torch.distributed.tensor.debug import CommDebugMode
@@ -39,6 +44,8 @@ from torch.testing._internal.common_fsdp import (
     MLP,
     MLPStack,
     patch_all_gather,
+    patch_foreach_all_gather,
+    patch_foreach_reduce,
     patch_reduce_scatter,
 )
 from torch.testing._internal.common_utils import (
@@ -1487,6 +1494,116 @@ class TestFullyShardCustomForwardMethod(FSDPTest):
         check_sharded_parity(self, ref_model, model)


+class TestFullyShardShareCommContext(FSDPTest):
+    @property
+    def world_size(self) -> int:
+        return min(torch.get_device_module(device_type).device_count(), 2)
+
+    @skip_if_lt_x_gpu(2)
+    def test_share_comm_context(self):
+        torch.manual_seed(42)
+        n_layers = 3
+        lin_dim = 16
+        model = nn.Sequential(
+            *[MLP(lin_dim, torch.device("cpu")) for _ in range(n_layers)]
+        )
+        ref_model = copy.deepcopy(model).to(device_type)
+        for layer in model:
+            fully_shard(layer)
+            layer._get_fsdp_state()._lazy_init()
+        share_comm_ctx(list(model))
+        torch.manual_seed(42 + self.rank + 1)
+        inp = torch.randn(4, 3, lin_dim, device=device_type.type)
+        ref_loss = ref_model(inp).sum()
+
+        all_gather_streams = set()
+        reduce_scatter_streams = set()
+
+        from torch.distributed.fsdp._fully_shard._fsdp_api import (
+            AllGather,
+            ReduceScatter,
+        )
+        from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam
+
+        orig_foreach_all_gather = foreach_all_gather
+
+        def foreach_all_gather_with_assert(
+            fsdp_params: list[FSDPParam],
+            group: dist.ProcessGroup,
+            async_op: bool,
+            all_gather_copy_in_stream: torch.Stream,
+            all_gather_stream: torch.Stream,
+            device: torch.device,
+            all_gather_comm: AllGather,
+        ):
+            nonlocal all_gather_streams
+            all_gather_streams.add(all_gather_stream)
+            return orig_foreach_all_gather(
+                fsdp_params,
+                group,
+                async_op,
+                all_gather_copy_in_stream,
+                all_gather_stream,
+                device,
+                all_gather_comm,
+            )
+
+        orig_foreach_reduce = foreach_reduce
+
+        @torch.no_grad()
+        def foreach_reduce_with_assert(
+            fsdp_params: list[FSDPParam],
+            unsharded_grads: list[torch.Tensor],
+            reduce_scatter_group: dist.ProcessGroup,
+            reduce_scatter_stream: torch.Stream,
+            reduce_scatter_comm: ReduceScatter,
+            orig_dtype: Optional[torch.dtype],
+            reduce_dtype: Optional[torch.dtype],
+            device: torch.device,
+            gradient_divide_factor: Optional[float],
+            all_reduce_group: Optional[dist.ProcessGroup],  # not `None` iff HSDP
+            all_reduce_stream: torch.Stream,
+            all_reduce_grads: bool,
+            partial_reduce_output: Optional[torch.Tensor],  # only used for HSDP
+            all_reduce_hook: Optional[Callable[[torch.Tensor], None]],
+            force_sum_reduction_for_comms: bool = False,
+        ):
+            nonlocal reduce_scatter_streams
+            reduce_scatter_streams.add(reduce_scatter_stream)
+            return orig_foreach_reduce(
+                fsdp_params,
+                unsharded_grads,
+                reduce_scatter_group,
+                reduce_scatter_stream,
+                reduce_scatter_comm,
+                orig_dtype,
+                reduce_dtype,
+                device,
+                gradient_divide_factor,
+                all_reduce_group,
+                all_reduce_stream,
+                all_reduce_grads,
+                partial_reduce_output,
+                all_reduce_hook,
+                force_sum_reduction_for_comms,
+            )
+
+        with (
+            patch_foreach_all_gather(foreach_all_gather_with_assert),
+            patch_foreach_reduce(foreach_reduce_with_assert),
+        ):
+            loss = model(inp).sum()
+            self.assertEqual(ref_loss, loss)
+            ref_loss.backward()
+            loss.backward()
+        for param in ref_model.parameters():
+            dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
+        self.assertEqual(len(all_gather_streams), 1)
+        self.assertEqual(len(reduce_scatter_streams), 1)
+        check_sharded_parity(self, ref_model, model)
+
+
 class TestFullyShardWorldSize1(FSDPTest):
     @property
     def world_size(self) -> int:

View File

@@ -6,6 +6,7 @@ from ._fully_shard import (
     MixedPrecisionPolicy,
     OffloadPolicy,
     register_fsdp_forward_method,
+    share_comm_ctx,
     UnshardHandle,
 )
 from .fully_sharded_data_parallel import (
@@ -54,6 +55,7 @@ __all__ = [
     "OffloadPolicy",
     "register_fsdp_forward_method",
     "UnshardHandle",
+    "share_comm_ctx",
 ]

 # Set namespace for exposed private names
@@ -64,3 +66,4 @@ MixedPrecisionPolicy.__module__ = "torch.distributed.fsdp"
 OffloadPolicy.__module__ = "torch.distributed.fsdp"
 register_fsdp_forward_method.__module__ = "torch.distributed.fsdp"
 UnshardHandle.__module__ = "torch.distributed.fsdp"
+share_comm_ctx.__module__ = "torch.distributed.fsdp"

View File

@@ -3,6 +3,7 @@ from ._fully_shard import (
     FSDPModule,
     fully_shard,
     register_fsdp_forward_method,
+    share_comm_ctx,
     UnshardHandle,
 )
@@ -15,4 +16,5 @@ __all__ = [
     "OffloadPolicy",
     "register_fsdp_forward_method",
     "UnshardHandle",
+    "share_comm_ctx",
 ]

View File

@@ -39,6 +39,7 @@ __all__ = [
     "register_fsdp_forward_method",
     "get_cls_to_fsdp_cls",
     "disable_fsdp_module_new_init",
+    "share_comm_ctx",
 ]
@@ -711,6 +712,34 @@ def register_fsdp_forward_method(module: nn.Module, method_name: str) -> None:
         )


+def share_comm_ctx(modules: list[FSDPModule]) -> None:
+    """
+    Share CUDA streams across multiple FSDPModules.
+
+    Example usage:
+        from torch.distributed.fsdp import share_comm_ctx
+        share_comm_ctx([fsdp_model_1, fsdp_model_2, ...])
+
+    For Pipeline Parallelism (PP), each model chunk is an FSDP root. We want
+    to share the CUDA streams for all-gather, reduce-scatter, and all-reduce
+    across roots. This avoids inter-stream memory fragmentation.
+
+    Args:
+        modules (list[FSDPModule]): modules to share CUDA streams across
+    """
+    if len(modules) == 0:
+        return
+    for module in modules:
+        if not isinstance(module, FSDPModule):
+            raise ValueError(f"Expects list of FSDPModules but got {module}")
+    fsdp_states = [module._get_fsdp_state() for module in modules]
+    comm_ctx = fsdp_states[0]._comm_ctx
+    for fsdp_state in fsdp_states[1:]:
+        fsdp_state._comm_ctx = comm_ctx
+        if fsdp_param_group := fsdp_state._fsdp_param_group:
+            fsdp_param_group.comm_ctx = comm_ctx
+
+
 def _assert_all_fsdp_modules(modules: Iterable[Any]) -> None:
     for module in modules:
         if not isinstance(module, FSDPModule):

View File

@@ -997,6 +997,42 @@ def patch_all_gather(new_all_gather_into_tensor: Callable):
         dist.all_gather_into_tensor = orig_all_gather


+@contextlib.contextmanager
+def patch_foreach_all_gather(new_foreach_all_gather: Callable):
+    orig_foreach_all_gather = (
+        torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_all_gather
+    )
+    dist.barrier()
+    torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_all_gather = (
+        new_foreach_all_gather
+    )
+    try:
+        yield
+    finally:
+        dist.barrier()
+        torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_all_gather = (
+            orig_foreach_all_gather
+        )
+
+
+@contextlib.contextmanager
+def patch_foreach_reduce(new_foreach_reduce: Callable):
+    orig_foreach_foreach_reduce = (
+        torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_reduce
+    )
+    dist.barrier()
+    torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_reduce = (
+        new_foreach_reduce
+    )
+    try:
+        yield
+    finally:
+        dist.barrier()
+        torch.distributed.fsdp._fully_shard._fsdp_param_group.foreach_reduce = (
+            orig_foreach_foreach_reduce
+        )
+
+
 @contextlib.contextmanager
 def patch_reduce_scatter(new_reduce_scatter_tensor: Callable):
     orig_reduce_scatter = dist.reduce_scatter_tensor