Mirror of https://github.com/pytorch/pytorch.git
Change test_invalid_names test to only test constructor of WorkerInfo (#30620)
Summary: This test seems to only verify that we throw exceptions in the `WorkerInfo` constructor when invalid names are passed in, so I don't think we need to complicate it by initializing RPC and exposing ourselves to potential flakiness.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30620
Differential Revision: D18766955
Pulled By: rohan-varma
fbshipit-source-id: 11643de4d57431e5f46e096c7766de3ab0b9b05a
Committed by: Facebook Github Bot
Parent: f9f54201d3
Commit: 5a484245d9
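The change itself is small: name validation lives in the `WorkerInfo` constructor, so it can be exercised without any store, rendezvous, or RPC agent. The sketch below is illustrative rather than the exact test from this commit (the real test sits in the `RpcTest` fixture shown in the diff); it assumes a PyTorch build where `WorkerInfo` is importable from `torch.distributed.rpc`, and the error-message patterns are the ones used in the diff.

import unittest

from torch.distributed.rpc import WorkerInfo


class WorkerInfoNameValidationTest(unittest.TestCase):
    # Hypothetical standalone test class, for illustration only.
    def test_invalid_names(self):
        worker_id = 0
        # Name contains a disallowed character.
        with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
            WorkerInfo("abc*", worker_id)
        # Empty name.
        with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
            WorkerInfo("", worker_id)
        # Name longer than MAX_NAME_LEN (128 at the time of this change).
        with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
            WorkerInfo("a" * 500, worker_id)


if __name__ == "__main__":
    unittest.main()

Because no FileStore or process group is created, the per-case file-name juggling and the shutdown/barrier handling in the removed code are no longer needed.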
@@ -396,86 +396,23 @@ class RpcTest(RpcAgentTestFixture):
         )
         rpc.shutdown()
 
-    @unittest.skip("test_invalid_names is flaky, see https://github.com/pytorch/pytorch/issues/25912")
     @dist_init(setup_rpc=False)
     def test_invalid_names(self):
+        from torch.distributed.rpc import WorkerInfo
+        worker_id = 0
         with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="abc*",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
-
-        base_file_name = self.file_name
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "1"
+            info = WorkerInfo("abc*", worker_id)
 
         with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name=" ",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
+            info = WorkerInfo(" ", worker_id)
 
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "2"
         with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
+            info = WorkerInfo("", worker_id)
 
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "3"
         # If the number in the message does not match, it is likely that the
         # value of MAX_NAME_LEN in RPC WorkerInfo has changed.
         with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="".join(["a" for i in range(500)]),
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
-
-        from torch.distributed.rpc.api import _agent
-        self.assertEqual(_agent, None)
-        # shutdown() should not do anything as _agent is None
-        rpc.shutdown()
-        # We need this barrier here because although init_process_group is
-        # blocking, it does not guarantee that all ranks are done with
-        # initialization after the call. We did run into issues with it where
-        # rank 3 crashed with "connection closed by peer" RuntimeError, which is
-        # caused by other ranks exit before rank 3 is ready. This can be fixed
-        # by adding a collective call to sync all processes.
-        #
-        # We decided not fixing this issue in init_process_group because it
-        # would add extra overhead to the call, and normal use cases won't
-        # create a progress group and exit without doing anything. Hence, it is
-        # not worthy to introduce the overhead just for this test case.
+            info = WorkerInfo("".join(["a" for i in range(500)]), worker_id)
 
     @dist_init
     def test_add(self):
@@ -44,6 +44,10 @@ PyObject* rpc_init(PyObject* /* unused */) {
           module,
           "WorkerInfo",
           R"(Encapsulates information of a worker in the system.)")
+          .def(
+              py::init<std::string, worker_id_t>(),
+              py::arg("name"),
+              py::arg("id"))
           .def_readonly("name", &WorkerInfo::name_, R"(Name of the worker.)")
           .def_readonly(
               "id", &WorkerInfo::id_, R"(Globally unique id of the worker.)")
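The `py::init<std::string, worker_id_t>()` binding added in this hunk is what lets Python code construct a `WorkerInfo` directly, which is exactly what the simplified test relies on. A minimal usage sketch follows; the worker name and id are arbitrary example values, not anything taken from this commit.

from torch.distributed.rpc import WorkerInfo

# The constructor runs the C++-side name validation.
info = WorkerInfo(name="worker0", id=0)

# Read-only attributes exposed via def_readonly above.
print(info.name)  # "worker0"
print(info.id)    # 0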