[PP] Add ZeroBubble schedule (#133467)
Zero bubble can already be expressed through `ScheduleFlexibleInterleaved1F1B` by setting `enable_zero_bubble=True`. Rather than requiring that flag at schedule initialization, this PR creates a separate `ScheduleInterleavedZeroBubble` class and transitions `Interleaved1F1B` to derive from `ScheduleFlexibleInterleaved1F1B`. We then don't need to expose `ScheduleFlexibleInterleaved1F1B` itself, since its name is not self-explanatory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133467
Approved by: https://github.com/wconstab
ghstack dependencies: #132691
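For illustration, a minimal usage sketch of the new class. The setup here is hypothetical and not part of this PR: `stages` (a list of `PipelineStage` objects, e.g. two per rank), `loss_fn`, and the microbatched inputs are assumed to already exist, with `torch.distributed` initialized:

```python
from torch.distributed.pipelining import ScheduleInterleavedZeroBubble

# Assumed to exist (not part of this PR): `stages` is a list of
# PipelineStage objects (e.g. 2 per rank), `loss_fn` is a loss callable,
# and `x`/`target` are the microbatched inputs/labels for this rank.
schedule = ScheduleInterleavedZeroBubble(
    stages,
    n_microbatches=8,  # example value; subject to the schedule's divisibility checks
    loss_fn=loss_fn,
)

# Ranks holding the first stage feed inputs; ranks holding the last
# stage pass `target` and collect per-microbatch losses.
losses = []
schedule.step(x, target=target, losses=losses)
```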
Committed by: PyTorch MergeBot
Parent: cedfac20c7
Commit: 108a75b454
docs/source/distributed.pipelining.rst
@@ -489,6 +489,8 @@ Pipeline Schedules
 
 .. autoclass:: ScheduleLoopedBFS
 
+.. autoclass:: ScheduleInterleavedZeroBubble
+
 .. autoclass:: PipelineScheduleSingle
   :members:
 
test/distributed/pipelining/test_schedule_multiproc.py
@@ -19,6 +19,7 @@ from torch.distributed.pipelining import (
     ScheduleFlexibleInterleaved1F1B,
     ScheduleGPipe,
     ScheduleInterleaved1F1B,
+    ScheduleInterleavedZeroBubble,
     ScheduleLoopedBFS,
 )
 from torch.distributed.pipelining.schedules import _PipelineScheduleRuntime
@@ -348,7 +349,10 @@ class ScheduleTest(MultiProcContinousTest):
 
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
-    @parametrize("ScheduleClass", [ScheduleInterleaved1F1B, ScheduleLoopedBFS])
+    @parametrize(
+        "ScheduleClass",
+        [ScheduleInterleaved1F1B, ScheduleLoopedBFS, ScheduleInterleavedZeroBubble],
+    )
     @parametrize("use_new_runtime", [False, True])
     def test_grad_with_manual_interleaved(self, ScheduleClass, use_new_runtime):
         stages_per_rank = 2
@@ -408,6 +412,7 @@ class ScheduleTest(MultiProcContinousTest):
             num_microbatches,
             loss_fn=loss_fn,
             stage_index_to_group_rank=old_schedule.stage_index_to_group_rank,
+            use_full_backward=old_schedule.use_full_backward,
         )
         tmp_schedule._load_actions(old_schedule.pipeline_order)
         # test that csv round-trip works for compute_comms schedule
@@ -416,6 +421,7 @@ class ScheduleTest(MultiProcContinousTest):
             num_microbatches,
             loss_fn=loss_fn,
             stage_index_to_group_rank=old_schedule.stage_index_to_group_rank,
+            use_full_backward=old_schedule.use_full_backward,
         )
         with tempfile.NamedTemporaryFile() as f:
             tmp_schedule._dump_csv(f.name)
@@ -426,6 +432,7 @@ class ScheduleTest(MultiProcContinousTest):
             num_microbatches,
             loss_fn=loss_fn,
             stage_index_to_group_rank=old_schedule.stage_index_to_group_rank,
+            use_full_backward=old_schedule.use_full_backward,
         )
         one_more_schedule._load_actions(
             schedule.pipeline_order_with_comms, format="compute_comms"
torch/distributed/pipelining/__init__.py
@@ -6,6 +6,7 @@ from .schedules import (
     ScheduleFlexibleInterleaved1F1B,
     ScheduleGPipe,
     ScheduleInterleaved1F1B,
+    ScheduleInterleavedZeroBubble,
     ScheduleLoopedBFS,
 )
 from .stage import build_stage, PipelineStage
@@ -23,4 +24,5 @@ __all__ = [
     "ScheduleGPipe",
     "ScheduleInterleaved1F1B",
     "ScheduleLoopedBFS",
+    "ScheduleInterleavedZeroBubble",
 ]
torch/distributed/pipelining/schedules.py
@@ -42,6 +42,7 @@ __all__ = [
     "ScheduleGPipe",
     "ScheduleInterleaved1F1B",
     "ScheduleLoopedBFS",
+    "ScheduleInterleavedZeroBubble",
 ]
 
 logger = logging.getLogger(__name__)
@@ -2110,6 +2111,35 @@ class ScheduleFlexibleInterleaved1F1B(PipelineScheduleMulti):
         return result
 
 
+class ScheduleInterleavedZeroBubble(ScheduleFlexibleInterleaved1F1B):
+    """
+    The Interleaved Zero Bubble schedule.
+    See https://arxiv.org/pdf/2401.10241 for details.
+    Will perform one forward and one backward on inputs for the microbatches in steady
+    state and supports multiple stages per rank. Uses the backward for weights to fill in
+    the pipeline bubble.
+    """
+
+    def __init__(
+        self,
+        stages: List[_PipelineStageBase],
+        n_microbatches: int,
+        loss_fn: Optional[Callable] = None,
+        args_chunk_spec: Optional[Tuple[TensorChunkSpec, ...]] = None,
+        kwargs_chunk_spec: Optional[Dict[str, TensorChunkSpec]] = None,
+        output_merge_spec: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+    ):
+        super().__init__(
+            stages=stages,
+            n_microbatches=n_microbatches,
+            loss_fn=loss_fn,
+            args_chunk_spec=args_chunk_spec,
+            kwargs_chunk_spec=kwargs_chunk_spec,
+            output_merge_spec=output_merge_spec,
+            enable_zero_bubble=True,
+        )
+
+
 def get_schedule_class(schedule_name: str):
     """
     Maps a schedule name to its corresponding class object.
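The docstring above captures the core idea from the paper: backward is split into a B step (gradients w.r.t. the stage inputs, which the upstream stage is waiting on) and a W step (gradients w.r.t. the weights, which nothing downstream depends on), so W steps can be deferred into otherwise-idle bubble slots. Below is a self-contained autograd illustration of that split; it is a conceptual sketch, not the schedule's internal mechanism:

```python
import torch

lin = torch.nn.Linear(4, 4)
x = torch.randn(2, 4, requires_grad=True)
loss = lin(x).sum()

# B step: gradient w.r.t. the stage input only; this is what the
# previous pipeline stage is blocked on, so it runs first.
(dx,) = torch.autograd.grad(loss, (x,), retain_graph=True)

# W step: gradients w.r.t. the weights; no downstream stage depends on
# these, so the schedule is free to slot them into pipeline bubbles.
dw, db = torch.autograd.grad(loss, (lin.weight, lin.bias))
```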
@@ -2123,6 +2153,7 @@ def get_schedule_class(schedule_name: str):
         "GPipe": ScheduleGPipe,
         "FlexibleInterleaved1F1B": ScheduleFlexibleInterleaved1F1B,
         "LoopedBFS": ScheduleLoopedBFS,
+        "InterleavedZeroBubble": ScheduleInterleavedZeroBubble,
         "PipelineScheduleSingle": PipelineScheduleSingle,
         "PipelineScheduleMulti": PipelineScheduleMulti,
     }
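Since the class is also registered in the `get_schedule_class` mapping, it can be selected by its string name, e.g. from a training config:

```python
from torch.distributed.pipelining.schedules import get_schedule_class

schedule_cls = get_schedule_class("InterleavedZeroBubble")
assert schedule_cls.__name__ == "ScheduleInterleavedZeroBubble"
```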