benchmark rpc ps (#57454)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57454

DDP with NCCL AllReduce for the entire model; experiment from Quip https://fb.quip.com/iQUtAeKIxWpF

I have been testing this on the AI cluster. There seem to be some connection problems with RPC when using multiple trainers or parameter servers.

```
Namespace(bconfig_id='3', dconfig_id='DummyData', mconfig_id='DummyModel', pconfig_id='None', tconfig_id='DdpNcclTrainer')
benchmark warmup done

metrics for trainer=0
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              | min      | max     | mean     | variance   | stdev     |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.972    | 0.097122   | 0.311644  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.58138 | 4.31439  | 0.00229848 | 0.0479424 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.4807  | 0.222566 | 0.0555432  | 0.235676  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.191488 | 3.54099 | 3.11694  | 0.557106   | 0.746395  |
+-----------------------------------+----------+---------+----------+------------+-----------+

metrics for trainer=1
+-----------------------------------+----------+---------+----------+-------------+------------+
| name                              | min      | max     | mean     | variance    | stdev      |
+===================================+==========+=========+==========+=============+============+
| backward_metric,backward          | 2.4617   | 2.59174 | 2.51196  | 0.000938276 | 0.0306313  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| batch_level_metric,batch_all      | 4.22605  | 4.71757 | 4.27921  | 0.00468424  | 0.0684415  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| foward_metric,forward_pass        | 0.807936 | 1.50118 | 0.846008 | 0.00601693  | 0.0775688  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| hook_future_metric,nccl_allreduce | 0.108544 | 0.1536  | 0.11222  | 2.16726e-05 | 0.00465538 |
+-----------------------------------+----------+---------+----------+-------------+------------+

metrics for all trainer
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              | min      | max     | mean     | variance   | stdev     |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.24198  | 0.584391   | 0.764455  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.71757 | 4.2968   | 0.00378467 | 0.0615197 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.50118 | 0.534287 | 0.128284   | 0.358167  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.108544 | 3.54099 | 1.61458  | 2.5456     | 1.59549   |
+-----------------------------------+----------+---------+----------+------------+-----------+
```

Test Plan: Imported from OSS

Reviewed By: H-Huang, ngimel

Differential Revision: D28296175

Pulled By: gcramer23

fbshipit-source-id: 5dd208fc86f8b5558d7c8860d685bb25c2e09fe7
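
For reference, the Namespace line in the output above uses the config ids 3 / DummyData / DummyModel / None / DdpNcclTrainer. Given the flag names defined in launcher.py below, the corresponding invocation would presumably be (the exact flag-to-id mapping is my assumption, not shown in the output):

```
python launcher.py --benchmark 3 --data DummyData --model DummyModel --server None --trainer DdpNcclTrainer
```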
Committed by: Facebook GitHub Bot
Parent: 94080f45ab
Commit: bc2540f0be
343 benchmarks/distributed/rpc/parameter_server/launcher.py (new file)
@@ -0,0 +1,343 @@
import argparse
import copy
import json
import os
from pathlib import Path

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.utils.data import DataLoader

from benchmark_class_helper import (get_benchmark_data_map,
                                    get_benchmark_model_map,
                                    get_benchmark_ps_map,
                                    get_benchmark_trainer_map)
from BenchmarkConfigurations import BenchmarkConfigurations
from metrics.ProcessedMetricsPrinter import ProcessedMetricsPrinter

USE_CUDA_RPC = "use_cuda_rpc"


def get_name(rank, configs):
    t_count = configs.trainer_count
    ps_count = configs.ps_count
    if rank < t_count:
        return f"trainer{rank}"
    elif rank < (t_count + ps_count):
        return f"ps{rank}"
    else:
        return "master"
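
# Example rank layout (derived from the logic above): with trainer_count=2 and
# ps_count=1, world_size is 4 and get_name maps rank 0 -> "trainer0",
# rank 1 -> "trainer1", rank 2 -> "ps2", rank 3 -> "master".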


def get_parameter_server_rank(rank, config):
    # rank mod parameter server count to get parameter server number
    # add trainer_count to get parameter server rank
    rank_mod_ps_count = rank % config.ps_count
    return rank_mod_ps_count + config.trainer_count
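
# Example: with trainer_count=4 and ps_count=2, trainers at ranks 0, 1, 2, 3
# are assigned parameter server ranks 4, 5, 4, 5 respectively.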


def get_ps_rref(parameter_server_rank, config):
    ps_config = config.ps_config
    ps = get_benchmark_ps_map()[str(ps_config["ps_class"])]
    name = get_name(
        parameter_server_rank,
        config
    )
    ps_args = ps_config["configurations"].values()
    # distribute trainers evenly across parameter servers; the first
    # (trainer_count % ps_count) servers take one extra trainer
    ps_trainer_count = config.trainer_count // config.ps_count
    rem = config.trainer_count % config.ps_count
    if parameter_server_rank - config.trainer_count < rem:
        ps_trainer_count += 1
    return rpc.remote(
        name,
        ps,
        args=(
            parameter_server_rank,
            ps_trainer_count,
            *ps_args,
        ),
    )
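
# Example of the trainer distribution above: with trainer_count=5 and
# ps_count=2, each server starts from 5 // 2 = 2 trainers and rem = 1, so the
# server at rank 5 serves 3 trainers and the server at rank 6 serves 2.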


def run_trainer(
    config, model, data, rank, ps_rref
):
    trainer_config = config.trainer_config
    trainer_class = get_benchmark_trainer_map()[str(trainer_config["trainer_class"])]
    trainer_args = trainer_config["configurations"].values()
    trainer = trainer_class(
        rank,
        config.trainer_count,
        ps_rref,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]


def call_trainers(config, model, train_data, parameter_server_rrefs):
    futs = []
    for trainer_rank in range(0, config.trainer_count):
        trainer_name = get_name(
            trainer_rank,
            config
        )
        ps_rref = None
        if parameter_server_rrefs:
            ps_rank = get_parameter_server_rank(trainer_rank, config)
            ps_rref = parameter_server_rrefs[ps_rank]
        fut = rpc.rpc_async(
            trainer_name,
            run_trainer,
            args=(
                config,
                copy.deepcopy(model),
                train_data[trainer_rank],
                trainer_rank,
                ps_rref,
            ),
            timeout=config.rpc_async_timeout
        )
        futs.append(fut)
    return futs


def benchmark_warmup(
    config, model, data, parameter_server_rrefs
):
    if config.ps_count > 0:
        ps_config = config.ps_config
        ps = get_benchmark_ps_map()[str(ps_config["ps_class"])]
    futs = call_trainers(config, model, data, parameter_server_rrefs)
    for fut in futs:
        fut.wait()
    for ps_rref in parameter_server_rrefs.values():
        rpc.rpc_sync(
            ps_rref.owner(),
            ps.reset_state,
            args=(ps_rref,)
        )
    print("benchmark warmup done\n")


def split_list(arr, n):
    return [arr[i::n] for i in range(n)]
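
# Example: split_list(list(range(6)), 2) returns [[0, 2, 4], [1, 3, 5]];
# batches are dealt round-robin, one slice per trainer.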


def run_master(rank, model, data, config, rpc_backend_options):
    world_size = config.trainer_count + config.ps_count + 1
    rpc.init_rpc(
        get_name(
            rank,
            config
        ),
        rank=rank,
        world_size=world_size,
        rpc_backend_options=rpc_backend_options
    )
    parameter_server_rrefs = {}
    for i in range(
        config.trainer_count, world_size - 1
    ):
        parameter_server_rrefs[i] = get_ps_rref(i, config)

    train_data = split_list(
        list(DataLoader(data, batch_size=config.batch_size)),
        config.trainer_count
    )

    # warm-up run of the benchmark
    benchmark_warmup(
        config, model, train_data, parameter_server_rrefs
    )
    # measured run of the benchmark
    trainer_futs = call_trainers(
        config, model, train_data, parameter_server_rrefs
    )
    # collect metrics and print
    metrics_printer = ProcessedMetricsPrinter()
    rank_metrics_list = [fut.wait() for fut in trainer_futs]
    metrics_printer.print_metrics("trainer", rank_metrics_list)


def run_benchmark(rank, model, data, config):
    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method
    if rank == world_size - 1:
        # master rank = trainer_count + parameter_server_count
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(
            get_name(
                rank,
                config
            ),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        if (USE_CUDA_RPC in trainer_config and
                trainer_config[USE_CUDA_RPC] and
                USE_CUDA_RPC in ps_config and
                ps_config[USE_CUDA_RPC] and
                config.ps_count > 0):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(
                ps_rank,
                config
            )
            # map this trainer's cuda:<rank> to the server's cuda:<ps_rank>
            # for CUDA RPC tensor transfers
            rpc_backend_options.set_device_map(
                ps_name,
                {rank: ps_rank}
            )
        trainer_name = get_name(
            rank,
            config
        )
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )
    rpc.shutdown()


def get_json_config(file_name, id):
    with open(
        os.path.join(
            Path(__file__).parent, file_name
        ),
        "r"
    ) as f:
        json_config = json.load(f)[id]
    return json_config


def load_configurations(args):
    trainer_config_file = args.trainer_config_path
    ps_config_file = args.server_config_path
    benchmark_config = get_json_config(args.benchmark_config_path, args.benchmark)
    benchmark_config["trainer_config"] = get_json_config(trainer_config_file, args.trainer)
    if args.server != "None":
        benchmark_config["ps_config"] = get_json_config(ps_config_file, args.server)
    else:
        benchmark_config["ps_config"] = None
    return BenchmarkConfigurations(**benchmark_config)
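
# For orientation, a hypothetical "DdpNcclTrainer" entry in
# trainer_configurations.json would look roughly like this (the "epochs" key
# is an invented example; only "trainer_class" and "configurations" are
# actually read by this launcher):
#
#   "DdpNcclTrainer": {
#       "trainer_class": "DdpNcclTrainer",
#       "configurations": {"epochs": 1}
#   }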


def get_data(data_class, data_config):
    data_class = get_benchmark_data_map()[data_class]
    return data_class(**data_config)


def load_data(args):
    data_config_file = args.data_config_path
    data_config = get_json_config(data_config_file, args.data)
    return get_data(data_config["data_class"], data_config["configurations"])


def get_model(model_class, model_config):
    model_class = get_benchmark_model_map()[model_class]
    return model_class(**model_config)


def load_model(args):
    model_config_file = args.model_config_path
    model_config = get_json_config(model_config_file, args.model)
    return get_model(model_config["model_class"], model_config["configurations"])


def main():
    parser = argparse.ArgumentParser(description="RPC PS Benchmark")

    parser.add_argument(
        "--benchmark_config_path",
        type=str,
        default="configurations/benchmark_configurations.json",
        help="path to benchmark configuration file"
    )
    parser.add_argument(
        "--data_config_path",
        type=str,
        default="configurations/data_configurations.json",
        help="path to data configuration file"
    )
    parser.add_argument(
        "--model_config_path",
        type=str,
        default="configurations/model_configurations.json",
        help="path to model configuration file"
    )
    parser.add_argument(
        "--server_config_path",
        type=str,
        default="configurations/server_configurations.json",
        help="path to server configuration file"
    )
    parser.add_argument(
        "--trainer_config_path",
        type=str,
        default="configurations/trainer_configurations.json",
        help="path to trainer configuration file"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        help="id for benchmark configuration"
    )
    parser.add_argument(
        "--data",
        type=str,
        help="id for data configuration"
    )
    parser.add_argument(
        "--model",
        type=str,
        help="id for model configuration"
    )
    parser.add_argument(
        "--server",
        type=str,
        help="id for parameter server configuration"
    )
    parser.add_argument(
        "--trainer",
        type=str,
        help="id for trainer configuration"
    )
    args = parser.parse_args()
    print(f"{args}\n")

    config = load_configurations(args)
    data = load_data(args)
    model = load_model(args)

    world_size = config.trainer_count + config.ps_count + 1

    mp.spawn(
        run_benchmark,
        args=(
            model,
            data,
            config,
        ),
        nprocs=world_size,
        join=True
    )


if __name__ == "__main__":
    main()