Mirror of https://github.com/pytorch/pytorch.git
Prefer dashes over underscores in command-line options. This adds `--command-arg-name` forms to the argument parsers; the old underscore forms (`--command_arg_name`) are kept for backward compatibility.
Both dashes and underscores are used in the PyTorch codebase, and some argument parsers accept only one or the other. For example, the `torchrun` utility for distributed training accepts only underscore arguments (e.g., `--master_port`). Dashes are more common in other command-line tools, and they appear to be the default choice in the Python standard library:
`argparse.BooleanOptionalAction`, from CPython's `Lib/argparse.py` at commit `4a9dff0e5a` (L893-L895):
```python
class BooleanOptionalAction(Action):
    def __init__(...):
        ...
        if option_string.startswith('--'):
            option_string = '--no-' + option_string[2:]
            _option_strings.append(option_string)
```
It adds `--no-argname`, not `--no_argname`. Also, typing `_` requires pressing the Shift (or Caps Lock) key, while `-` does not.
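A minimal sketch of the aliasing pattern (the `--state-size` option below mirrors one of the arguments touched by this PR; `argparse` treats every option string after the first as an alias for the same destination):

```python
import argparse

parser = argparse.ArgumentParser()
# dest is derived from the first long option: '--state-size' -> 'state_size'
parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')

# Both spellings parse to the same attribute:
assert parser.parse_args(['--state-size', '5-15-10']).state_size == '5-15-10'
assert parser.parse_args(['--state_size', '5-15-10']).state_size == '5-15-10'
```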
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94505
Approved by: https://github.com/ezyang, https://github.com/seemethere
214 lines · 8.6 KiB · Python
```python
import argparse
import json
import os
import time

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp

from coordinator import CoordinatorBase

COORDINATOR_NAME = "coordinator"
AGENT_NAME = "agent"
OBSERVER_NAME = "observer{}"  # formatted with the worker's rank, e.g. "observer2"

TOTAL_EPISODES = 10
TOTAL_EPISODE_STEPS = 100
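
# Rank 0 runs the coordinator, rank 1 the agent, and every remaining rank an
# observer named via OBSERVER_NAME.format(rank).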

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
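
# Example: str2bool('yes') -> True, str2bool('0') -> False; any other string
# (e.g. 'maybe') raises argparse.ArgumentTypeError. find_graph_variable() below
# uses it to coerce the --batch value.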

parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
# Each option accepts both the dash and the underscore spelling; argparse
# derives dest (e.g. "world_size") from the first option string. Values stay
# strings here so find_graph_variable() can detect comma-separated sweeps.
parser.add_argument('--world-size', '--world_size', type=str, default='10')
parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
parser.add_argument('--master-port', '--master_port', type=str, default='29501')
parser.add_argument('--batch', type=str, default='True')

parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
parser.add_argument('--nlayers', type=str, default='5')
parser.add_argument('--out-features', '--out_features', type=str, default='10')
parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')

args = parser.parse_args()
args = vars(args)  # use a plain dict so entries can be rewritten per run

def run_worker(rank, world_size, master_addr, master_port, batch, state_size,
               nlayers, out_features, queue):
    r"""
    Inits an RPC worker.

    Args:
        rank (int): RPC rank of the worker machine
        world_size (int): Number of workers in the RPC network (number of
            observers + 1 agent + 1 coordinator)
        master_addr (str): Master address of the coordinator
        master_port (str): Master port of the coordinator
        batch (bool): Whether the agent will use batching or process one
            observer request at a time
        state_size (str): Numerical str representing state dimensions (e.g., 5-15-10)
        nlayers (int): Number of layers in the model
        out_features (int): Number of out features in the model
        queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context()
            for saving benchmark run results to
    """
    state_size = list(map(int, state_size.split('-')))
    batch_size = world_size - 2  # no. of observers

    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    if rank == 0:
        rpc.init_rpc(COORDINATOR_NAME, rank=rank, world_size=world_size)

        coordinator = CoordinatorBase(
            batch_size, batch, state_size, nlayers, out_features)
        coordinator.run_coordinator(TOTAL_EPISODES, TOTAL_EPISODE_STEPS, queue)

    elif rank == 1:
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)
    else:
        rpc.init_rpc(OBSERVER_NAME.format(rank),
                     rank=rank, world_size=world_size)
    rpc.shutdown()
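
# Example: with world_size=4 there are 4 - 2 = 2 observers, and the processes
# register as "coordinator" (rank 0), "agent" (rank 1), "observer2", "observer3".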

def find_graph_variable(args):
    r"""
    Determines whether the user specified multiple entries for a single
    argument, in which case the benchmark is run once for each entry.
    Comma-separated values in a given argument indicate multiple entries.
    Output is presented so that the user can use the plot repo to plot the
    results, with the variable argument's entries on the x axis. args is
    modified in place accordingly. More than one argument with multiple
    entries is not permitted.

    Args:
        args (dict): Dictionary containing arguments passed by the user
            (and default arguments)
    """
    var_types = {'world_size': int,
                 'state_size': str,
                 'nlayers': int,
                 'out_features': int,
                 'batch': str2bool}
    for arg in var_types.keys():
        if ',' in args[arg]:
            if args.get('x_axis_name'):
                raise ValueError("Only 1 x axis graph variable allowed")
            args[arg] = list(map(var_types[arg], args[arg].split(',')))  # convert comma-separated str to list
            args['x_axis_name'] = arg
        else:
            args[arg] = var_types[arg](args[arg])  # convert string to proper type
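
# Example: {'world_size': '10,20', ...} becomes args['world_size'] = [10, 20]
# with args['x_axis_name'] = 'world_size'; a second comma-separated argument
# would raise ValueError. Scalar values are converted in place, e.g. '5' -> 5.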

def append_spaces(string, length):
    r"""
    Returns a modified string with spaces appended to the end. If the length
    of the string argument is greater than or equal to length, a single space
    is appended; otherwise x spaces are appended, where x is the difference
    between length and the length of string.

    Args:
        string (str): String to be modified
        length (int): Size of desired return string with spaces appended

    Return: (str)
    """
    string = str(string)
    offset = length - len(string)
    if offset <= 0:
        offset = 1
    string += ' ' * offset
    return string
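
# Example: append_spaces('p50', 7) returns 'p50    ' (width 7), while an input
# already wider than `length`, such as 'agent throughput', gets a single
# trailing space.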

def print_benchmark_results(report):
    r"""
    Prints benchmark results.

    Args:
        report (dict): JSON-formatted dictionary containing relevant data on
            the run of this application
    """
    print("--------------------------------------------------------------")
    print("PyTorch distributed rpc benchmark reinforcement learning suite")
    print("--------------------------------------------------------------")
    for key, val in report.items():
        if key != "benchmark_results":
            print(f'{key} : {val}')

    x_axis_name = report.get('x_axis_name')
    col_width = 7
    heading = ""
    if x_axis_name:
        x_axis_output_label = f'{x_axis_name} |'
        heading += append_spaces(x_axis_output_label, col_width)
    metric_headers = ['agent latency (seconds)', 'agent throughput',
                      'observer latency (seconds)', 'observer throughput']
    percentile_subheaders = ['p50', 'p75', 'p90', 'p95']
    subheading = ""
    if x_axis_name:
        subheading += append_spaces(' ' * (len(x_axis_output_label) - 1), col_width)
    for header in metric_headers:
        heading += append_spaces(header, col_width * len(percentile_subheaders))
    for percentile in percentile_subheaders:
        subheading += append_spaces(percentile, col_width)
    print(heading)
    print(subheading)

    for benchmark_run in report['benchmark_results']:
        run_results = ""
        if x_axis_name:
            run_results += append_spaces(benchmark_run[x_axis_name],
                                         max(col_width, len(x_axis_output_label)))
        for metric_name in metric_headers:
            percentile_results = benchmark_run[metric_name]
            for percentile in percentile_subheaders:
                run_results += append_spaces(percentile_results[percentile], col_width)
        print(run_results)
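
# The table prints one row per benchmark run: an optional x-axis column first,
# then p50/p75/p90/p95 sub-columns under each of the four metric headers.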

def main():
    r"""
    Runs the RPC benchmark once if no argument has multiple entries, and
    otherwise once for each of the multiple entries. Multiple entries are
    indicated by comma-separated values, and may only be given for a single
    argument. Results are printed as well as saved to the output file. In the
    case of multiple entries for a single argument, the plot repo can be used
    to graph the results on the y axis, with each entry on the x axis.
    """
    find_graph_variable(args)

    # run once if no x axis variables
    x_axis_variables = args[args['x_axis_name']] if args.get('x_axis_name') else [None]
    ctx = mp.get_context('spawn')
    queue = ctx.SimpleQueue()
    benchmark_runs = []
    for i, x_axis_variable in enumerate(x_axis_variables):  # run benchmark for every x axis variable
        if len(x_axis_variables) > 1:
            args[args['x_axis_name']] = x_axis_variable  # set x axis variable for this benchmark iteration
        processes = []
        start_time = time.time()
        for rank in range(args['world_size']):
            prc = ctx.Process(
                target=run_worker,
                args=(
                    rank, args['world_size'], args['master_addr'], args['master_port'],
                    args['batch'], args['state_size'], args['nlayers'],
                    args['out_features'], queue
                )
            )
            prc.start()
            processes.append(prc)
        benchmark_run_results = queue.get()  # blocks until the coordinator (rank 0) publishes results
        for process in processes:
            process.join()
        print(f"Time taken for benchmark run {i}: {time.time() - start_time}")
        if args.get('x_axis_name'):
            # save the x axis value for this iteration in the results
            benchmark_run_results[args['x_axis_name']] = x_axis_variable
        benchmark_runs.append(benchmark_run_results)

    report = args
    report['benchmark_results'] = benchmark_runs
    if args.get('x_axis_name'):
        # x_axis_name was the variable, so don't save a constant for it in the report
        del report[args['x_axis_name']]
    with open(args['output_file_path'], 'w') as f:
        json.dump(report, f)
    print_benchmark_results(report)


if __name__ == '__main__':
    main()
```
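As a usage sketch (assuming the file is saved as `launcher.py` alongside `coordinator.py`, as in PyTorch's `benchmarks/distributed/rpc/rl/` directory): running `python launcher.py --world-size 5,10` sweeps the benchmark over world sizes 5 and 10 and records `world_size` as the x-axis variable in `benchmark_report.json`, and the underscore spelling `--world_size 5,10` behaves identically.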