Use fixed MASTER_PORT in test_distributed (#13109)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13109

The "right" strategy of creating a socket, binding to an undefined port, closing the socket, and reusing the port it was bound to, was subject to a race condition. Another process could bind to that same port sooner than the tests would, causing an "Address already in use" failure when rank 0 would try and bind to that same port. The THD tests have been using a fixed port since forever. Time will tell if this fixes #12876.

Differential Revision: D10850614

fbshipit-source-id: c19f12bb4916141187ee8ddb52880f5f418310dc
Pieter Noordhuis
2018-10-25 08:49:37 -07:00
committed by Facebook Github Bot
parent 956e620c64
commit 2a6431ba2d
2 changed files with 5 additions and 6 deletions


@@ -18,6 +18,7 @@ import torch.nn.functional as F
 import torch.optim as optim
 from common_utils import TestCase
 from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR
+from torch._utils_internal import TEST_MASTER_PORT as MASTER_PORT
 from torch.autograd import Variable
 import common_utils as common
@@ -1293,8 +1294,9 @@ if BACKEND == "gloo" or BACKEND == "nccl":
         @classmethod
         def setUpClass(cls):
-            os.environ["MASTER_ADDR"] = MASTER_ADDR
-            os.environ["WORLD_SIZE"] = WORLD_SIZE
+            os.environ["MASTER_ADDR"] = str(MASTER_ADDR)
+            os.environ["MASTER_PORT"] = str(MASTER_PORT)
+            os.environ["WORLD_SIZE"] = str(WORLD_SIZE)
             for attr in dir(cls):
                 if attr.startswith("test"):
                     fn = getattr(cls, attr)
@@ -1308,10 +1310,6 @@ if BACKEND == "gloo" or BACKEND == "nccl":
                 _, filename = tempfile.mkstemp(prefix=FOLDER)
                 INIT_METHOD = "file://{}".format(filename)
-            if INIT_METHOD.startswith("env://"):
-                port = common.find_free_port()
-                os.environ["MASTER_PORT"] = str(port)
             self.processes = []
             self.rank = self.MANAGER_PROCESS_RANK
             Barrier.init()


@@ -30,3 +30,4 @@ def prepare_multiprocessing_environment(path):
 TEST_MASTER_ADDR = '127.0.0.1'
+TEST_MASTER_PORT = 29500
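
For context on how the fixed port gets consumed: with init_method="env://", rank 0 binds the rendezvous on MASTER_ADDR:MASTER_PORT and the other ranks connect to it, so every process must see the same values. A minimal sketch of a process picking them up (backend, rank, and world size here are illustrative defaults, not part of this change):

    import os
    import torch.distributed as dist

    # The env:// rendezvous reads these variables; os.environ only
    # accepts strings, which is why setUpClass wraps the values in str().
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    dist.init_process_group(
        backend="gloo",
        init_method="env://",
        rank=int(os.environ.get("RANK", "0")),
        world_size=int(os.environ.get("WORLD_SIZE", "1")),
    )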