# mypy: allow-untyped-defs
from typing import Set

import torch.nn as nn


def _annotate_modules_for_dynamo(
    module: nn.Module,
    ignored_modules: Set[nn.Module],
    use_orig_params: bool,
):
    """
    Annotates the submodules in ``module`` 's tree, except those in
    ``ignored_modules``, indicating that the submodules are FSDP-managed and
    saving the ``use_orig_params`` setting passed to the FSDP constructor.
    """
    for submodule in module.modules():
        if submodule not in ignored_modules:
            """[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]

            Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
            it skips tracing all the torch.distributed.fsdp code.
            - Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
              gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
            - However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
              and we need a way to indicate to dynamo which modules are wrapped by FSDP.

            (*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
            guards. NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
            their code is well-behaved.

            One particular issue with specialized NNModules for FSDP is that the
            views created for orig_params are captured into the compiled graph on the first iteration, and while
            they are always going to point to the correct flatparameter and give correct results, their order
            of creation influences the order of backward execution, preventing overlap of comm and computation
            during backward. We need to _use_ the new parameter views created on each forward iteration, in
            order for backward to interleave hooks with compute per layer. UnspecializedNNModule lets us achieve
            this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
            """
            submodule._is_fsdp_managed_module = True  # type: ignore[assignment]

            # Dynamo only supports FSDP with use_orig_params=True.
            # This is hacky, but I could not think of another way to add an assertion to dynamo
            # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
            # FSDP module directly
            submodule._fsdp_use_orig_params = use_orig_params  # type: ignore[assignment]
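

# A minimal sketch (not part of the upstream file) of how this annotation is
# consumed: FSDP calls _annotate_modules_for_dynamo() at wrap time, and Dynamo
# later reads the flags off each annotated submodule. The toy model and the
# getattr checks below are illustrative assumptions, not the real FSDP wrapping
# or the actual Dynamo code path.
if __name__ == "__main__":
    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))

    # Annotate everything in the tree except the ReLU, mimicking a non-empty
    # ignored_modules set passed to the FSDP constructor.
    _annotate_modules_for_dynamo(model, {model[1]}, use_orig_params=True)

    for name, mod in model.named_modules():
        # Consumers must read these attributes via getattr with a default,
        # since plain nn.Modules never define them.
        managed = getattr(mod, "_is_fsdp_managed_module", False)
        print(name or "<root>", type(mod).__name__, managed)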