mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Add --filter-local-ranks to torchrun: allow filtering console logs by local rank (#118562)
Addresses issue https://github.com/pytorch/pytorch/issues/117383. The implementation exposes `--filter_local_ranks` (alias `--filter-local-ranks`), which selects, by local rank, which log files we pass to `TailLog` (used in torchrun to determine which logs to output to stdout/stderr). ## Behavior ### with --tee Currently --tee is implemented as --redirect to a file, with the file streamed to the console using `tail`. When --tee is specified, file logs are unaffected and only the output to the console is filtered. ### with --redirect When --redirect is specified without --tee, nothing is logged to the console, so the filter is a no-op. ### with neither When neither --tee nor --redirect is specified, torchrun uses the empty string "" to indicate logging to the console. For ranks that are filtered out, we intercept this empty string and redirect it to "/dev/null" so that nothing is printed to the console. The API also allows a per-rank configuration for --tee and --redirect, which this filter implementation supports as well. ## Usage ### without --tee ``` > TORCH_LOGS_FORMAT="%(levelname)s: %(message)s" TORCH_LOGS="graph" torchrun --standalone --nproc_per_node=2 --role rank --filter_local_ranks=0 t.py hello from rank 0 python DEBUG: TRACED GRAPH __compiled_fn_0 <eval_with_key>.0 opcode name target args kwargs ------------- ------ ----------------------- --------- -------- placeholder l_x_ L_x_ () {} call_function mul <built-in function mul> (l_x_, 5) {} output output output ((mul,),) {} ... ``` ### with --tee ``` > TORCH_LOGS_FORMAT="%(levelname)s: %(message)s" TORCH_LOGS="graph" torchrun --standalone --nproc_per_node=2 --role rank --tee 3 --filter_local_ranks=0 t.py [rank0]:hello from rank 0 python [rank0]:DEBUG: TRACED GRAPH [rank0]: __compiled_fn_0 <eval_with_key>.0 opcode name target args kwargs [rank0]:------------- ------ ----------------------- --------- -------- [rank0]:placeholder l_x_ L_x_ () {} [rank0]:call_function mul <built-in function mul> (l_x_, 5) {} [rank0]:output output output ((mul,),) {} ... 
``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/118562 Approved by: https://github.com/wconstab, https://github.com/wanchaol
This commit is contained in:
committed by
PyTorch MergeBot
parent
995f69623d
commit
73229b4f93
@ -376,7 +376,7 @@ import os
|
||||
import sys
|
||||
import uuid
|
||||
from argparse import REMAINDER, ArgumentParser
|
||||
from typing import Callable, List, Tuple, Union
|
||||
from typing import Callable, List, Tuple, Union, Optional, Set
|
||||
|
||||
import torch
|
||||
from torch.distributed.argparse_util import check_env, env
|
||||
@ -548,6 +548,17 @@ def get_args_parser() -> ArgumentParser:
|
||||
help="Tee std streams into a log file and also to console (see --redirects for format).",
|
||||
)
|
||||
|
||||
# Console-only rank filter. The value is a single string; config_from_args
# splits it on "," and converts each item to int, so the help text shows the
# comma-separated form.
parser.add_argument(
    "--filter-local-ranks",
    "--filter_local_ranks",
    action=env,
    type=str,
    default="",
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that continues a sentence must end with an explicit trailing space.
    help="Only show logs from specified ranks in console (e.g. [--filter_local_ranks=0,1,2] will "
    "only show logs from rank 0, 1 and 2). This will only apply to stdout and stderr, not to "
    "log files saved via --redirect or --tee",
)
|
||||
|
||||
#
|
||||
# Backwards compatible parameters with caffe2.distributed.launch.
|
||||
#
|
||||
@ -724,6 +735,16 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
|
||||
|
||||
rdzv_endpoint = get_rdzv_endpoint(args)
|
||||
|
||||
# Parse --filter_local_ranks (e.g. "0,1,2") into a set of ints. `ranks` stays
# None when the flag was not supplied (empty string).
ranks: Optional[Set[int]] = None
if args.filter_local_ranks:
    try:
        ranks = {int(rank) for rank in args.filter_local_ranks.split(",")}
    except ValueError as e:
        # int() rejects empty or non-numeric items (e.g. "0,,1" or "a,b").
        # A non-empty string always splits into at least one item, so a
        # successful parse can never yield an empty set; no `assert` is needed
        # (and asserts are stripped under `python -O` anyway).
        raise ValueError(
            "--filter_local_ranks must be a comma-separated list of integers e.g. --filter_local_ranks=0,1,2"
        ) from e
|
||||
|
||||
config = LaunchConfig(
|
||||
min_nodes=min_nodes,
|
||||
max_nodes=max_nodes,
|
||||
@ -741,6 +762,7 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
|
||||
log_dir=args.log_dir,
|
||||
log_line_prefix_template=log_line_prefix_template,
|
||||
local_addr=args.local_addr,
|
||||
filter_local_ranks=ranks,
|
||||
)
|
||||
|
||||
with_python = not args.no_python
|
||||
|
Reference in New Issue
Block a user