mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Use fixed MASTER_PORT in test_distributed (#13109)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/13109 The "right" strategy of creating a socket, binding it to an unspecified port, closing the socket, and then reusing the port it had been bound to was subject to a race condition: another process could bind to that same port before the tests did, causing an "Address already in use" failure when rank 0 tried to bind to it. The THD tests have always used a fixed port. Time will tell if this fixes #12876. Differential Revision: D10850614 fbshipit-source-id: c19f12bb4916141187ee8ddb52880f5f418310dc
This commit is contained in:
committed by
Facebook Github Bot
parent
956e620c64
commit
2a6431ba2d
@ -18,6 +18,7 @@ import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from common_utils import TestCase
|
||||
from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR
|
||||
from torch._utils_internal import TEST_MASTER_PORT as MASTER_PORT
|
||||
from torch.autograd import Variable
|
||||
import common_utils as common
|
||||
|
||||
@ -1293,8 +1294,9 @@ if BACKEND == "gloo" or BACKEND == "nccl":
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
os.environ["MASTER_ADDR"] = MASTER_ADDR
|
||||
os.environ["WORLD_SIZE"] = WORLD_SIZE
|
||||
os.environ["MASTER_ADDR"] = str(MASTER_ADDR)
|
||||
os.environ["MASTER_PORT"] = str(MASTER_PORT)
|
||||
os.environ["WORLD_SIZE"] = str(WORLD_SIZE)
|
||||
for attr in dir(cls):
|
||||
if attr.startswith("test"):
|
||||
fn = getattr(cls, attr)
|
||||
@ -1308,10 +1310,6 @@ if BACKEND == "gloo" or BACKEND == "nccl":
|
||||
_, filename = tempfile.mkstemp(prefix=FOLDER)
|
||||
INIT_METHOD = "file://{}".format(filename)
|
||||
|
||||
if INIT_METHOD.startswith("env://"):
|
||||
port = common.find_free_port()
|
||||
os.environ["MASTER_PORT"] = str(port)
|
||||
|
||||
self.processes = []
|
||||
self.rank = self.MANAGER_PROCESS_RANK
|
||||
Barrier.init()
|
||||
|
@ -30,3 +30,4 @@ def prepare_multiprocessing_environment(path):
|
||||
|
||||
|
||||
TEST_MASTER_ADDR = '127.0.0.1'
|
||||
TEST_MASTER_PORT = 29500
|
||||
|
Reference in New Issue
Block a user