mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[codemod][lint][fbcode/c*] Enable BLACK by default
Test Plan: manual inspection & sandcastle
Reviewed By: zertosh
Differential Revision: D30279364
fbshipit-source-id: c1ed77dfe43a3bde358f92737cd5535ae5d13c9a
This commit is contained in:
committed by
Facebook GitHub Bot
parent
aac3c7bd06
commit
b004307252
@ -1,22 +1,26 @@
|
||||
|
||||
import torch.distributed as c10d
|
||||
import torch
|
||||
import argparse
|
||||
import os
|
||||
import logging
|
||||
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.distributed as c10d
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Simple script to simulate NCCL errors. The script is '
|
||||
'supposed to be run on multiple different nodes simultaneously with '
|
||||
'appropriate rank and world_size. The script run an allreduce() on '
|
||||
'the rank 0 node and aborts all the other nodes to simulate an error '
|
||||
'in NCCL')
|
||||
parser.add_argument('addr', help='address of the master node to connect to.')
|
||||
parser.add_argument('port', help='port of the master node to connect to.')
|
||||
parser.add_argument('rank', help='rank of this node')
|
||||
parser.add_argument('world_size', help='number of nodes in process group')
|
||||
description="Simple script to simulate NCCL errors. The script is "
|
||||
"supposed to be run on multiple different nodes simultaneously with "
|
||||
"appropriate rank and world_size. The script run an allreduce() on "
|
||||
"the rank 0 node and aborts all the other nodes to simulate an error "
|
||||
"in NCCL"
|
||||
)
|
||||
parser.add_argument("addr", help="address of the master node to connect to.")
|
||||
parser.add_argument("port", help="port of the master node to connect to.")
|
||||
parser.add_argument("rank", help="rank of this node")
|
||||
parser.add_argument("world_size", help="number of nodes in process group")
|
||||
args = parser.parse_args()
|
||||
rank = int(args.rank)
|
||||
world_size = int(args.world_size)
|
||||
@ -24,14 +28,14 @@ if __name__ == "__main__":
|
||||
|
||||
store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
|
||||
process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
|
||||
logging.info('Running first allreduce')
|
||||
logging.info("Running first allreduce")
|
||||
process_group.allreduce(torch.rand(10).cuda(rank)).wait()
|
||||
if rank == 0:
|
||||
logging.info('Running second allreduce only on rank 0')
|
||||
logging.info("Running second allreduce only on rank 0")
|
||||
work = process_group.allreduce(torch.rand(10).cuda(rank))
|
||||
logging.info('Waiting for allreduce to complete...')
|
||||
logging.info("Waiting for allreduce to complete...")
|
||||
work.wait()
|
||||
logging.info('Second allreduce successful: {}'.format(work.is_success()))
|
||||
logging.info("Second allreduce successful: {}".format(work.is_success()))
|
||||
else:
|
||||
logging.info('Aborting all other ranks.')
|
||||
logging.info("Aborting all other ranks.")
|
||||
os.abort()
|
||||
|
Reference in New Issue
Block a user