import argparse
import json
import os
from pathlib import Path

from data import data_map
from metrics.ProcessedMetricsPrinter import ProcessedMetricsPrinter
from models import model_map
from server import server_map
from trainer import (
    criterion_map,
    ddp_hook_map,
    ddp_model_map,
    hook_state_map,
    iteration_step_map,
    preprocess_data_map,
    trainer_map,
)

import torch
import torch.distributed as c10d
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.futures import wait_all
from torch.utils.data import DataLoader


def get_name(rank, args):
    r"""
    A function that gets the name for the rank argument.

    Args:
        rank (int): process number in the world
        args (parser): benchmark configurations
    """
    t_count = args.ntrainer + args.ncudatrainer
    s_count = args.nserver + args.ncudaserver
    if rank < t_count:
        return f"trainer{rank}"
    elif rank < (t_count + s_count):
        return f"server{rank}"
    else:
        return "master"


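# Illustrative rank layout (a hypothetical run, assuming --ntrainer 2
# --ncudatrainer 2 --nserver 1 --ncudaserver 1, so world_size = 7):
#   ranks 0-1 -> "trainer0", "trainer1"   (CPU trainers)
#   ranks 2-3 -> "trainer2", "trainer3"   (CUDA trainers)
#   rank  4   -> "server4"                (CPU parameter server)
#   rank  5   -> "server5"                (CUDA parameter server)
#   rank  6   -> "master"

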
def get_server_rank(args, rank):
    r"""
    A function that gets the server rank for the rank argument.

    Args:
        args (parser): benchmark configurations
        rank (int): trainer rank
    """
    s_offset = args.ntrainer + args.ncudatrainer
    tps = args.ntrainer // args.nserver
    return rank // tps + s_offset


def get_cuda_server_rank(args, rank):
    r"""
    A function that gets the cudaserver rank for the rank argument.

    Args:
        args (parser): benchmark configurations
        rank (int): trainer rank
    """
    s_offset = args.ntrainer + args.ncudatrainer + args.nserver
    t_index = rank - args.ntrainer
    ctps = args.ncudatrainer // args.ncudaserver
    return t_index // ctps + s_offset


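# Worked example (same hypothetical 2/2/1/1 layout as above): CPU trainer
# rank 1 maps to server rank 1 // (2 // 1) + 4 = 4, and CUDA trainer rank 3
# maps to cudaserver rank (3 - 2) // (2 // 1) + 5 = 5.

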
def get_server_rref(server_rank, args, extra_args):
    r"""
    A function that creates an RRef to the server.

    Args:
        server_rank (int): process number in the world
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
    """
    server = server_map[args.server]
    name = get_name(server_rank, args)
    if extra_args is not None:
        server_args = extra_args.values()
    else:
        server_args = []
    if server_rank >= args.ntrainer + args.ncudatrainer + args.nserver:
        # cudaserver: serves the CUDA trainers; integer division keeps
        # trainer_count an int
        trainer_count = args.ncudatrainer // args.ncudaserver
        use_cuda_rpc = True
    else:
        # CPU server: serves the CPU trainers
        trainer_count = args.ntrainer // args.nserver
        use_cuda_rpc = False
    return rpc.remote(
        name,
        server,
        args=(
            server_rank,
            trainer_count,
            use_cuda_rpc,
            *server_args,
        ),
    )


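# Note: rpc.remote returns an RRef that owns the server instance on the
# remote worker; the launcher later invokes methods through it, e.g.
# server_rref.rpc_sync().reset_state(server_rref) in benchmark_warmup and
# server_rref.rpc_sync().get_metrics(server_rref) in get_server_metrics.

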
def run_trainer(args, extra_args, data, rank, server_rref):
    r"""
    A function that obtains a trainer instance and calls
    the train method.

    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        data (list): training samples
        rank (int): process number in the world
        server_rref (RRef): remote reference to the server
    """
    trainer_class = trainer_map[args.trainer]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
    elif args.backend == "multi":
        # "multi" uses an NCCL process group for the trainer and also
        # initializes a default gloo process group
        process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
        if not c10d.is_initialized():
            c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)

    model = load_model(args)
    preprocess_data = preprocess_data_map[args.preprocess_data]
    create_criterion = criterion_map[args.create_criterion]
    create_ddp_model = ddp_model_map[args.create_ddp_model]
    iteration_step = iteration_step_map[args.iteration_step]
    hook_state_class = hook_state_map[args.hook_state]
    hook = ddp_hook_map[args.ddp_hook]
    # check if this is a cudatrainer
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        preprocess_data,
        create_criterion,
        create_ddp_model,
        hook_state_class,
        hook,
        iteration_step,
        *trainer_args,
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]


def call_trainers(args, extra_args, train_data, server_rrefs):
    r"""
    A function that starts the trainers. Each trainer is started
    using an rpc_async request.

    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        train_data (list): training samples
        server_rrefs (dict): a dictionary containing server RRefs
    """
    futs = []
    for trainer_rank in range(0, args.ntrainer + args.ncudatrainer):
        trainer_name = get_name(trainer_rank, args)
        server_rref = None
        if server_rrefs:
            if trainer_rank >= args.ntrainer:
                server_rank = get_cuda_server_rank(args, trainer_rank)
            else:
                server_rank = get_server_rank(args, trainer_rank)
            server_rref = server_rrefs[server_rank]
        fut = rpc.rpc_async(
            trainer_name,
            run_trainer,
            args=(
                args,
                extra_args,
                train_data[trainer_rank],
                trainer_rank,
                server_rref,
            ),
            timeout=args.rpc_timeout,
        )
        futs.append(fut)
    return futs


def benchmark_warmup(args, extra_args, data, server_rrefs):
    r"""
    A function that runs the training algorithm to warm up RPC.
    The server states are reset after the warmup.

    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        data (list): training samples
        server_rrefs (dict): a dictionary containing server RRefs
    """
    futs = call_trainers(args, extra_args, data, server_rrefs)
    wait_all(futs)
    for server_rref in server_rrefs.values():
        server_rref.rpc_sync().reset_state(server_rref)
    print("benchmark warmup done\n")


def split_list(arr, n):
    r"""
    A function that splits a list into n lists.

    Args:
        arr (list): training samples
        n (int): number of output lists
    """
    return [arr[i::n] for i in range(n)]


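# The slicing above distributes elements round-robin rather than in
# contiguous chunks, e.g. split_list([0, 1, 2, 3, 4, 5], 2) returns
# [[0, 2, 4], [1, 3, 5]].

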
def get_server_metrics(server_rrefs):
    r"""
    A function that calls the remote server to obtain metrics
    collected during the benchmark run.

    Args:
        server_rrefs (dict): a dictionary containing server RRefs
    """
    rank_metrics = []
    for rank, server_rref in server_rrefs.items():
        metrics = server_rref.rpc_sync().get_metrics(server_rref)
        rank_metrics.append([rank, metrics])
    return rank_metrics


def run_master(rank, data, args, extra_configs, rpc_backend_options):
    r"""
    A function that runs the master process in the world. This function
    obtains remote references to initialized servers, splits the data,
    runs the trainers, and prints metrics.

    Args:
        rank (int): process number in the world
        data (list): training samples
        args (parser): benchmark configurations
        extra_configs (dict): configurations added by the user
        rpc_backend_options (TensorPipeRpcBackendOptions): configurations/options for the rpc
    """
    world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    rpc.init_rpc(
        get_name(rank, args),
        rank=rank,
        world_size=world_size,
        rpc_backend_options=rpc_backend_options,
    )
    server_rrefs = {}
    for i in range(args.ntrainer + args.ncudatrainer, world_size - 1):
        server_rrefs[i] = get_server_rref(i, args, extra_configs["server_config"])
    train_data = split_list(
        list(DataLoader(data, batch_size=args.batch_size)),
        args.ntrainer + args.ncudatrainer,
    )

    # warmup run of the benchmark
    benchmark_warmup(args, extra_configs["trainer_config"], train_data, server_rrefs)
    # run the benchmark
    trainer_futs = call_trainers(
        args, extra_configs["trainer_config"], train_data, server_rrefs
    )
    # collect metrics and print
    metrics_printer = ProcessedMetricsPrinter()
    rank_metrics_list = wait_all(trainer_futs)
    metrics_printer.print_metrics("trainer", rank_metrics_list)
    rank_metrics_list = get_server_metrics(server_rrefs)
    metrics_printer.print_metrics("server", rank_metrics_list)


def run_benchmark(rank, args, data):
    r"""
    A function that runs the benchmark.

    Args:
        rank (int): process number in the world
        args (parser): configuration args
        data (list): training samples
    """

    config = load_extra_configs(args)

    torch.manual_seed(args.torch_seed)
    torch.cuda.manual_seed_all(args.cuda_seed)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    os.environ["MASTER_ADDR"] = args.master_addr
    os.environ["MASTER_PORT"] = args.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions(rpc_timeout=args.rpc_timeout)
    if rank == world_size - 1:
        # master rank is ntrainer + ncudatrainer + nserver + ncudaserver
        run_master(rank, data, args, config, rpc_backend_options)
    elif rank >= args.ntrainer + args.ncudatrainer:
        # parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
        rpc.init_rpc(
            get_name(rank, args),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    else:
        # trainers = [0, ntrainer + ncudatrainer)
        if rank >= args.ntrainer:
            # cudatrainers need a device map to send CUDA tensors over RPC
            server_rank = get_cuda_server_rank(args, rank)
            server_name = get_name(server_rank, args)
            rpc_backend_options.set_device_map(server_name, {rank: server_rank})
        trainer_name = get_name(rank, args)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    rpc.shutdown()


def get_json_config(file_name: str, id: str):
    r"""
    A function that loads a json configuration from a file.

    Args:
        file_name (str): name of configuration file to load
        id (str): configuration that will be loaded
    """
    with open(Path(__file__).parent / file_name) as f:
        json_config = json.load(f)[id]
    return json_config


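# The configuration files are keyed by id. As load_data and load_model below
# show, a data entry is expected to carry "data_class" and "configurations"
# keys; a hypothetical file (name and id illustrative, not prescriptive)
# might look like:
#   {"DummyData": {"data_class": "DummyData", "configurations": {...}}}
# so get_json_config(file_name, "DummyData") returns the inner dict.

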
def load_extra_configs(args):
    r"""
    A function that creates a dictionary containing any extra configurations
    set by the user. The dictionary has two keys, trainer_config and
    server_config, each defaulting to None.

    Args:
        args (parser): launcher configurations
    """
    trainer_config_file = args.trainer_config_path
    server_config_file = args.server_config_path
    configurations = {"trainer_config": None, "server_config": None}
    if args.trainer is not None and trainer_config_file is not None:
        configurations["trainer_config"] = get_json_config(
            trainer_config_file, args.trainer
        )
    if args.server is not None and server_config_file is not None:
        configurations["server_config"] = get_json_config(
            server_config_file, args.server
        )
    return configurations


def load_data(args):
    r"""
    A function that creates an instance of the data class.

    Args:
        args (parser): launcher configurations
    """
    data_config_file = args.data_config_path
    data_config = get_json_config(data_config_file, args.data)
    data_class = data_map[data_config["data_class"]]
    return data_class(**data_config["configurations"])


def load_model(args):
    r"""
    A function that creates an instance of the model class.

    Args:
        args (parser): launcher configurations
    """
    model_config_file = args.model_config_path
    model_config = get_json_config(model_config_file, args.model)
    model_class = model_map[model_config["model_class"]]
    return model_class(**model_config["configurations"])


def main(args):
    r"""
    A function that creates multiple processes to run the benchmark.

    Args:
        args (parser): launcher configurations
    """
    # CPU and CUDA trainer checks
    if args.ntrainer > 0 and args.ncudatrainer > 0:
        assert args.nserver > 0 and args.ncudaserver > 0
    if args.nserver > 0:
        assert args.ntrainer > 0
        assert args.ntrainer % args.nserver == 0
    if args.ncudaserver > 0:
        assert args.ncudatrainer > 0
        assert args.ncudatrainer % args.ncudaserver == 0

    world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1

    data = load_data(args)

    mp.spawn(
        run_benchmark,
        args=(
            args,
            data,
        ),
        nprocs=world_size,
        join=True,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="RPC server Benchmark")
    parser.add_argument(
        "--master-addr",
        "--master_addr",
        type=str,
        help="IP address of the machine that will host the process with rank 0",
    )
    parser.add_argument(
        "--master-port",
        "--master_port",
        type=str,
        help="A free port on the machine that will host the process with rank 0",
    )
    parser.add_argument(
        "--trainer",
        type=str,
        help="trainer map key to get trainer class for benchmark run",
    )
    parser.add_argument("--ntrainer", type=int, help="trainer count for benchmark run")
    parser.add_argument(
        "--ncudatrainer", type=int, help="cudatrainer count for benchmark run"
    )
    parser.add_argument(
        "--filestore", type=str, help="filestore location for process group"
    )
    parser.add_argument(
        "--server",
        type=str,
        help="server map key to get server class for benchmark run",
    )
    parser.add_argument("--nserver", type=int, help="server count for benchmark run")
    parser.add_argument(
        "--ncudaserver", type=int, help="cudaserver count for benchmark run"
    )
    parser.add_argument(
        "--rpc-timeout",
        "--rpc_timeout",
        type=int,
        help="timeout in seconds to use for RPC",
    )
    parser.add_argument(
        "--backend",
        type=str,
        help="distributed communication backend to use for benchmark run",
    )
    parser.add_argument("--epochs", type=int, help="epoch count for training")
    parser.add_argument(
        "--batch-size",
        "--batch_size",
        type=int,
        help="number of training examples used in one iteration",
    )
    parser.add_argument("--data", type=str, help="id for data configuration")
    parser.add_argument("--model", type=str, help="id for model configuration")
    parser.add_argument(
        "--data-config-path",
        "--data_config_path",
        type=str,
        help="path to data configuration file",
    )
    parser.add_argument(
        "--model-config-path",
        "--model_config_path",
        type=str,
        help="path to model configuration file",
    )
    parser.add_argument(
        "--server-config-path",
        "--server_config_path",
        type=str,
        help="path to server configuration file",
    )
    parser.add_argument(
        "--trainer-config-path",
        "--trainer_config_path",
        type=str,
        help="path to trainer configuration file",
    )
    parser.add_argument(
        "--torch-seed",
        "--torch_seed",
        type=int,
        help="seed for PyTorch's random number generators",
    )
    parser.add_argument(
        "--cuda-seed",
        "--cuda_seed",
        type=int,
        help="seed for the random number generators on all GPUs",
    )
    parser.add_argument(
        "--preprocess-data",
        "--preprocess_data",
        type=str,
        help="this function will be used to preprocess data before training",
    )
    parser.add_argument(
        "--create-criterion",
        "--create_criterion",
        type=str,
        help="this function will be used to create the criterion used for model loss calculation",
    )
    parser.add_argument(
        "--create-ddp-model",
        "--create_ddp_model",
        type=str,
        help="this function will be used to create the ddp model used during training",
    )
    parser.add_argument(
        "--hook-state",
        "--hook_state",
        type=str,
        help="this will be the state class used when registering the ddp communication hook",
    )
    parser.add_argument(
        "--ddp-hook",
        "--ddp_hook",
        type=str,
        default="allreduce_hook",
        help="ddp communication hook",
    )
    parser.add_argument(
        "--iteration-step",
        "--iteration_step",
        type=str,
        help="this will be the function called for each iteration of training",
    )
    args = parser.parse_args()
    print(f"{args}\n")

    main(args)
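
# A hypothetical invocation of this script (all map keys, paths, and values
# below are illustrative; substitute the actual keys registered in data_map,
# model_map, trainer_map, and server_map, and the JSON config files shipped
# with this benchmark):
#
#   python launcher.py \
#       --master-addr 127.0.0.1 --master-port 29500 \
#       --ntrainer 2 --ncudatrainer 0 --nserver 1 --ncudaserver 0 \
#       --trainer MyTrainer --server MyServer \
#       --backend gloo --filestore /tmp/filestore \
#       --rpc-timeout 180 --epochs 2 --batch-size 4 \
#       --data MyData --model MyModel \
#       --data-config-path configurations/data.json \
#       --model-config-path configurations/model.json \
#       --torch-seed 0 --cuda-seed 0 \
#       --preprocess-data my_preprocess --create-criterion my_criterion \
#       --create-ddp-model my_ddp_model --hook-state my_hook_state \
#       --iteration-step my_iteration_step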