Files
pytorch/torch/distributed/argparse_util.py
Simon Fan 284b0b5f44 Add --local-ranks-filter to torchrun: allow logs filtering by rank (#118562)
Addresses issue https://github.com/pytorch/pytorch/issues/117383

The implementation exposes `--local-ranks-filter` which filters by rank which files we pass to `TailLog` (used in torchrun to determine which logs to output to stdout/stderr)

## Behavior
### with --tee
Currently --tee is implemented as --redirect to file, and streams file to console using `tail`. When --tee is specified, file logs will be unaffected and we will only filter the output to console.

### with --redirect
When --redirect is specified without --tee, nothing is logged to console, so we no-op.

### with neither
When neither --tee or --redirect are specified, torchrun uses empty string "" to indicate logging to console. We intercept this empty string, and redirect it to "/dev/null" to not print to console.

The api also allows a per-rank configuration for --tee and --redirect, and is also supported by this filter implementation.

## Usage
### without --tee
```
> TORCH_LOGS_FORMAT="%(levelname)s: %(message)s" TORCH_LOGS="graph" torchrun --standalone --nproc_per_node=2 --role rank --local_rank_filter=0 t.py
hello from rank 0 python
DEBUG: TRACED GRAPH
 __compiled_fn_0 <eval_with_key>.0 opcode         name    target                   args       kwargs
-------------  ------  -----------------------  ---------  --------
placeholder    l_x_    L_x_                     ()         {}
call_function  mul     <built-in function mul>  (l_x_, 5)  {}
output         output  output                   ((mul,),)  {}
...
```
### with --tee
```
> TORCH_LOGS_FORMAT="%(levelname)s: %(message)s" TORCH_LOGS="graph" torchrun --standalone --nproc_per_node=2 --role rank --tee 3 --local_rank_filter=0 t.py
[rank0]:hello from rank 0 python
[rank0]:DEBUG: TRACED GRAPH
[rank0]: __compiled_fn_0 <eval_with_key>.0 opcode         name    target                   args       kwargs
[rank0]:-------------  ------  -----------------------  ---------  --------
[rank0]:placeholder    l_x_    L_x_                     ()         {}
[rank0]:call_function  mul     <built-in function mul>  (l_x_, 5)  {}
[rank0]:output         output  output                   ((mul,),)  {}
...
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118562
Approved by: https://github.com/wconstab, https://github.com/wanchaol
2024-02-07 04:29:54 +00:00

104 lines
3.8 KiB
Python

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import os
from argparse import Action
class env(Action):
"""
Get argument values from ``PET_{dest}`` before defaulting to the given ``default`` value.
For flags (e.g. ``--standalone``)
use ``check_env`` instead.
.. note:: when multiple option strings are specified, ``dest`` is
the longest option string (e.g. for ``"-f", "--foo"``
the env var to set is ``PET_FOO`` not ``PET_F``)
Example:
::
parser.add_argument("-f", "--foo", action=env, default="bar")
./program -> args.foo="bar"
./program -f baz -> args.foo="baz"
./program --foo baz -> args.foo="baz"
PET_FOO="env_bar" ./program -f baz -> args.foo="baz"
PET_FOO="env_bar" ./program --foo baz -> args.foo="baz"
PET_FOO="env_bar" ./program -> args.foo="env_bar"
parser.add_argument("-f", "--foo", action=env, required=True)
./program -> fails
./program -f baz -> args.foo="baz"
PET_FOO="env_bar" ./program -> args.foo="env_bar"
PET_FOO="env_bar" ./program -f baz -> args.foo="baz"
"""
def __init__(self, dest, default=None, required=False, **kwargs) -> None:
env_name = f"PET_{dest.upper()}"
default = os.environ.get(env_name, default)
# ``required`` means that it NEEDS to be present in the command-line args
# rather than "this option requires a value (either set explicitly or default"
# so if we found default then we don't "require" it to be in the command-line
# so set it to False
if default:
required = False
super().__init__(dest=dest, default=default, required=required, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
class check_env(Action):
"""
Check whether the env var ``PET_{dest}`` exists before defaulting to the given ``default`` value.
Equivalent to
``store_true`` argparse built-in action except that the argument can
be omitted from the commandline if the env var is present and has a
non-zero value.
.. note:: it is redundant to pass ``default=True`` for arguments
that use this action because a flag should be ``True``
when present and ``False`` otherwise.
Example:
::
parser.add_argument("--verbose", action=check_env)
./program -> args.verbose=False
./program --verbose -> args.verbose=True
PET_VERBOSE=1 ./program -> args.verbose=True
PET_VERBOSE=0 ./program -> args.verbose=False
PET_VERBOSE=0 ./program --verbose -> args.verbose=True
Anti-pattern (don't do this):
::
parser.add_argument("--verbose", action=check_env, default=True)
./program -> args.verbose=True
./program --verbose -> args.verbose=True
PET_VERBOSE=1 ./program -> args.verbose=True
PET_VERBOSE=0 ./program -> args.verbose=False
"""
def __init__(self, dest, default=False, **kwargs) -> None:
env_name = f"PET_{dest.upper()}"
default = bool(int(os.environ.get(env_name, "1" if default else "0")))
super().__init__(dest=dest, const=True, default=default, nargs=0, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, self.const)