[BE] Prefer dash over underscore in command-line options (#94505)
Prefer dashes over underscores in command-line options. This change adds the dashed form `--command-arg-name` to the argument parsers; the old underscored arguments `--command_arg_name` are kept as aliases for backward compatibility.
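As a minimal sketch of that pattern (not the exact `torchrun` definitions, which also use a custom `action=env` and other options shown in the diff below), `argparse` lets one argument register several option strings, so the dashed spelling can be the primary name while the underscored spelling keeps working:

```python
import argparse

parser = argparse.ArgumentParser()
# Register the new dashed spelling first; keep the old underscored
# spelling as an alias so existing launch commands keep working.
parser.add_argument(
    "--rdzv-backend",
    "--rdzv_backend",
    type=str,
    help="Rendezvous backend.",
)

# Both spellings populate the same destination, args.rdzv_backend
# (argparse derives the attribute name from the first long option).
print(parser.parse_args(["--rdzv-backend", "c10d"]).rdzv_backend)  # c10d
print(parser.parse_args(["--rdzv_backend", "c10d"]).rdzv_backend)  # c10d
```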
Both dashes and underscores are used in the PyTorch codebase today, and some argument parsers accept only one of the two spellings. For example, the `torchrun` utility for distributed training accepts only underscored arguments (e.g., `--master_port`). Dashes are the more common convention in other command-line tools, and they appear to be the default choice in the Python standard library:
`argparse.BooleanOptionalAction`: 4a9dff0e5a/Lib/argparse.py (L893-L895)
```python
class BooleanOptionalAction(Action):
    def __init__(...):
        if option_string.startswith('--'):
            option_string = '--no-' + option_string[2:]
            _option_strings.append(option_string)
```
It adds `--no-argname`, not `--no_argname`. Typing `_` also requires the Shift (or Caps Lock) key, whereas `-` does not.
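For a concrete illustration of that `--no-` behavior, here is a small sketch (assumes Python 3.9+, where `BooleanOptionalAction` was added; `--verbose` is just a placeholder flag):

```python
import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction derives a dashed negative form automatically:
# registering --verbose also registers --no-verbose (not --no_verbose).
parser.add_argument("--verbose", action=argparse.BooleanOptionalAction, default=True)

print(parser.parse_args([]).verbose)                # True (default)
print(parser.parse_args(["--no-verbose"]).verbose)  # False
```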
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94505
Approved by: https://github.com/ezyang, https://github.com/seemethere
Committed by: PyTorch MergeBot
Parent: a63524684d
Commit: a229b4526f
@@ -30,11 +30,11 @@ Transitioning from torch.distributed.launch to torchrun
 
 
 ``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except**
-for ``--use_env`` which is now deprecated. To migrate from ``torch.distributed.launch``
+for ``--use-env`` which is now deprecated. To migrate from ``torch.distributed.launch``
 to ``torchrun`` follow these steps:
 
 1. If your training script is already reading ``local_rank`` from the ``LOCAL_RANK`` environment variable.
-   Then you need simply omit the ``--use_env`` flag, e.g.:
+   Then you need simply omit the ``--use-env`` flag, e.g.:
 
 +--------------------------------------------------------------------+--------------------------------------------+
 | ``torch.distributed.launch`` | ``torchrun`` |
@@ -42,11 +42,11 @@ to ``torchrun`` follow these steps:
 | | |
 | .. code-block:: shell-session | .. code-block:: shell-session |
 | | |
-| $ python -m torch.distributed.launch --use_env train_script.py | $ torchrun train_script.py |
+| $ python -m torch.distributed.launch --use-env train_script.py | $ torchrun train_script.py |
 | | |
 +--------------------------------------------------------------------+--------------------------------------------+
 
-2. If your training script reads local rank from a ``--local_rank`` cmd argument.
+2. If your training script reads local rank from a ``--local-rank`` cmd argument.
    Change your training script to read from the ``LOCAL_RANK`` environment variable as
    demonstrated by the following code snippet:
 
@@ -59,7 +59,7 @@ to ``torchrun`` follow these steps:
 | | |
 | import argparse | import os |
 | parser = argparse.ArgumentParser() | local_rank = int(os.environ["LOCAL_RANK"]) |
-| parser.add_argument("--local_rank", type=int) | |
+| parser.add_argument("--local-rank", type=int) | |
 | args = parser.parse_args() | |
 | | |
 | local_rank = args.local_rank | |
@@ -85,7 +85,7 @@ Single-node multi-worker
     torchrun
         --standalone
         --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 Stacked single-node multi-worker
@@ -94,18 +94,18 @@ Stacked single-node multi-worker
 To run multiple instances (separate jobs) of single-node, multi-worker on the
 same host, we need to make sure that each instance (job) is
 setup on different ports to avoid port conflicts (or worse, two jobs being merged
-as a single job). To do this you have to run with ``--rdzv_backend=c10d``
-and specify a different port by setting ``--rdzv_endpoint=localhost:$PORT_k``.
+as a single job). To do this you have to run with ``--rdzv-backend=c10d``
+and specify a different port by setting ``--rdzv-endpoint=localhost:$PORT_k``.
 For ``--nodes=1``, its often convenient to let ``torchrun`` pick a free random
 port automatically instead of manually assgining different ports for each run.
 
 ::
 
     torchrun
-        --rdzv_backend=c10d
-        --rdzv_endpoint=localhost:0
+        --rdzv-backend=c10d
+        --rdzv-endpoint=localhost:0
         --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 
@@ -116,11 +116,11 @@ Fault tolerant (fixed sized number of workers, no elasticity, tolerates 3 failures)
 
     torchrun
         --nnodes=$NUM_NODES
-        --nproc_per_node=$NUM_TRAINERS
-        --max_restarts=3
-        --rdzv_id=$JOB_ID
-        --rdzv_backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@@ -137,11 +137,11 @@ Elastic (``min=1``, ``max=4``, tolerates up to 3 membership changes or failures)
 
     torchrun
         --nnodes=1:4
-        --nproc_per_node=$NUM_TRAINERS
-        --max_restarts=3
-        --rdzv_id=$JOB_ID
-        --rdzv_backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@@ -156,10 +156,10 @@ Note on rendezvous backend
 
 For multi-node training you need to specify:
 
-1. ``--rdzv_id``: A unique job id (shared by all nodes participating in the job)
-2. ``--rdzv_backend``: An implementation of
+1. ``--rdzv-id``: A unique job id (shared by all nodes participating in the job)
+2. ``--rdzv-backend``: An implementation of
    :py:class:`torch.distributed.elastic.rendezvous.RendezvousHandler`
-3. ``--rdzv_endpoint``: The endpoint where the rendezvous backend is running; usually in form
+3. ``--rdzv-endpoint``: The endpoint where the rendezvous backend is running; usually in form
    ``host:port``.
 
 Currently ``c10d`` (recommended), ``etcd-v2``, and ``etcd`` (legacy) rendezvous backends are
@@ -221,7 +221,7 @@ The following environment variables are made available to you in your script:
 of the worker is specified in the ``WorkerSpec``.
 
 5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to
-   ``--nproc_per_node`` specified on ``torchrun``.
+   ``--nproc-per-node`` specified on ``torchrun``.
 
 6. ``WORLD_SIZE`` - The world size (total number of workers in the job).
 
@@ -246,7 +246,7 @@ Deployment
 ------------
 
 1. (Not needed for the C10d backend) Start the rendezvous backend server and get the endpoint (to be
-   passed as ``--rdzv_endpoint`` to the launcher script)
+   passed as ``--rdzv-endpoint`` to the launcher script)
 
 2. Single-node multi-worker: Start the launcher on the host to start the agent process which
    creates and monitors a local worker group.
@@ -406,6 +406,7 @@ def get_args_parser() -> ArgumentParser:
         help="Number of nodes, or the range of nodes in form <minimum_nodes>:<maximum_nodes>.",
     )
     parser.add_argument(
+        "--nproc-per-node",
         "--nproc_per_node",
         action=env,
         type=str,
@@ -418,6 +419,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--rdzv-backend",
         "--rdzv_backend",
         action=env,
         type=str,
@@ -425,6 +427,7 @@ def get_args_parser() -> ArgumentParser:
         help="Rendezvous backend.",
     )
     parser.add_argument(
+        "--rdzv-endpoint",
         "--rdzv_endpoint",
         action=env,
         type=str,
@@ -432,6 +435,7 @@ def get_args_parser() -> ArgumentParser:
         help="Rendezvous backend endpoint; usually in form <host>:<port>.",
     )
     parser.add_argument(
+        "--rdzv-id",
         "--rdzv_id",
         action=env,
         type=str,
@@ -439,6 +443,7 @@ def get_args_parser() -> ArgumentParser:
         help="User-defined group id.",
     )
     parser.add_argument(
+        "--rdzv-conf",
         "--rdzv_conf",
         action=env,
         type=str,
@@ -450,7 +455,7 @@ def get_args_parser() -> ArgumentParser:
         action=check_env,
         help="Start a local standalone rendezvous backend that is represented by a C10d TCP store "
         "on port 29400. Useful when launching single-node, multi-worker job. If specified "
-        "--rdzv_backend, --rdzv_endpoint, --rdzv_id are auto-assigned; any explicitly set values "
+        "--rdzv-backend, --rdzv-endpoint, --rdzv-id are auto-assigned; any explicitly set values "
         "are ignored.",
     )
 
@@ -459,6 +464,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--max-restarts",
         "--max_restarts",
         action=env,
         type=int,
@@ -466,6 +472,7 @@ def get_args_parser() -> ArgumentParser:
         help="Maximum number of worker group restarts before failing.",
     )
     parser.add_argument(
+        "--monitor-interval",
         "--monitor_interval",
         action=env,
         type=float,
@@ -473,6 +480,7 @@ def get_args_parser() -> ArgumentParser:
         help="Interval, in seconds, to monitor the state of workers.",
     )
     parser.add_argument(
+        "--start-method",
         "--start_method",
         action=env,
         type=str,
@@ -495,6 +503,7 @@ def get_args_parser() -> ArgumentParser:
         "with the same behavior as 'python -m'.",
     )
     parser.add_argument(
+        "--no-python",
         "--no_python",
         action=check_env,
         help="Skip prepending the training script with 'python' - just execute it directly. Useful "
@@ -502,13 +511,15 @@ def get_args_parser() -> ArgumentParser:
     )
 
     parser.add_argument(
+        "--run-path",
         "--run_path",
         action=check_env,
         help="Run the training script with runpy.run_path in the same interpreter."
         " Script must be provided as an abs path (e.g. /abs/path/script.py)."
-        " Takes precedence over --no_python.",
+        " Takes precedence over --no-python.",
     )
     parser.add_argument(
+        "--log-dir",
         "--log_dir",
         action=env,
         type=str,
@@ -541,6 +552,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--node-rank",
         "--node_rank",
         type=int,
         action=env,
@@ -548,16 +560,18 @@ def get_args_parser() -> ArgumentParser:
         help="Rank of the node for multi-node distributed training.",
     )
     parser.add_argument(
+        "--master-addr",
         "--master_addr",
         default="127.0.0.1",
         type=str,
         action=env,
         help="Address of the master node (rank 0) that only used for static rendezvous. It should "
         "be either the IP address or the hostname of rank 0. For single node multi-proc training "
-        "the --master_addr can simply be 127.0.0.1; IPv6 should have the pattern "
+        "the --master-addr can simply be 127.0.0.1; IPv6 should have the pattern "
         "`[0:0:0:0:0:0:0:1]`.",
     )
     parser.add_argument(
+        "--master-port",
         "--master_port",
         default=29500,
         type=int,
@@ -566,6 +580,7 @@ def get_args_parser() -> ArgumentParser:
         "training. It is only used for static rendezvous.",
     )
     parser.add_argument(
+        "--local-addr",
         "--local_addr",
         default=None,
         type=str,
@@ -652,7 +667,7 @@ def get_use_env(args) -> bool:
     """
     Retrieves ``use_env`` from the args.
     ``use_env`` is a legacy argument, if ``use_env`` is False, the
-    ``--node_rank`` argument will be transferred to all worker processes.
+    ``--node-rank`` argument will be transferred to all worker processes.
     ``use_env`` is only used by the ``torch.distributed.launch`` and will
     be deprecated in future releases.
     """
@@ -729,12 +744,12 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str]]:
         else:
             if args.module:
                 raise ValueError(
-                    "Don't use both the '--no_python' flag"
+                    "Don't use both the '--no-python' flag"
                     " and the '--module' flag at the same time."
                 )
             cmd = args.training_script
         if not use_env:
-            cmd_args.append(f"--local_rank={macros.local_rank}")
+            cmd_args.append(f"--local-rank={macros.local_rank}")
         cmd_args.extend(args.training_script_args)
 
     return config, cmd, cmd_args
@@ -760,9 +775,9 @@ def run(args):
         log.info(
             f"\n**************************************\n"
             f"Rendezvous info:\n"
-            f"--rdzv_backend={args.rdzv_backend} "
-            f"--rdzv_endpoint={args.rdzv_endpoint} "
-            f"--rdzv_id={args.rdzv_id}\n"
+            f"--rdzv-backend={args.rdzv_backend} "
+            f"--rdzv-endpoint={args.rdzv_endpoint} "
+            f"--rdzv-id={args.rdzv_id}\n"
             f"**************************************\n"
         )
 