mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
See https://github.com/pytorch/pytorch/pull/129751#issue-2380881501. Most changes are auto-generated by the linter. You can review these PRs via: `git diff --ignore-all-space --ignore-blank-lines HEAD~1`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/129757 Approved by: https://github.com/ezyang
43 lines
1.6 KiB
Python
43 lines
1.6 KiB
Python
import argparse
|
|
import logging
|
|
import os
|
|
|
|
import torch
|
|
import torch.distributed as c10d
|
|
|
|
|
|
# Configure the root logger once at import time: INFO and above, with every
# record prefixed by its timestamp, logger name and level.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
|
|
|
|
def parse_args(argv=None):
    """Parse the command line of the NCCL error simulation script.

    Args:
        argv: Sequence of argument strings to parse. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``addr`` (str) and ``port``, ``rank``
        and ``world_size`` (all converted to ``int`` by argparse, so a
        non-numeric value produces a usage error instead of a traceback).
    """
    parser = argparse.ArgumentParser(
        description="Simple script to simulate NCCL errors. The script is "
        "supposed to be run on multiple different nodes simultaneously with "
        "appropriate rank and world_size. The script run an allreduce() on "
        "the rank 0 node and aborts all the other nodes to simulate an error "
        "in NCCL"
    )
    parser.add_argument("addr", help="address of the master node to connect to.")
    parser.add_argument(
        "port", type=int, help="port of the master node to connect to."
    )
    parser.add_argument("rank", type=int, help="rank of this node")
    parser.add_argument(
        "world_size", type=int, help="number of nodes in process group"
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Run one collective on every rank, then abort all ranks except rank 0.

    Every rank first takes part in a common allreduce. Rank 0 then starts a
    second allreduce on its own and waits for it, while every other rank
    calls ``os.abort()``.

    Args:
        argv: Optional argument list forwarded to ``parse_args``.

    Raises:
        RuntimeError: If no CUDA device is visible to this process.
    """
    args = parse_args(argv)
    rank = args.rank
    world_size = args.world_size

    # The global rank is not a valid device ordinal on a node that has fewer
    # GPUs than there are ranks (e.g. one process per node), so map it onto
    # the devices that are actually present locally.
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        raise RuntimeError("No CUDA device available; NCCL requires a GPU.")
    device = rank % num_devices

    # Rank 0 hosts the TCP store (fourth argument is the is_master flag);
    # the remaining ranks connect to it as clients.
    store = c10d.TCPStore(args.addr, args.port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)

    logging.info("Running first allreduce")
    process_group.allreduce(torch.rand(10).cuda(device)).wait()

    if rank == 0:
        logging.info("Running second allreduce only on rank 0")
        work = process_group.allreduce(torch.rand(10).cuda(device))
        logging.info("Waiting for allreduce to complete...")
        work.wait()
        logging.info("Second allreduce successful: %s", work.is_success())
    else:
        logging.info("Aborting all other ranks.")
        # os.abort() kills the process immediately, with no Python-level
        # cleanup; this is the simulated failure.
        os.abort()


if __name__ == "__main__":
    main()
|