Mirror of https://github.com/pytorch/pytorch.git
Change test_invalid_names test to only test constructor of WorkerInfo (#30620)
Summary: This test seems to only verify that we throw exceptions in the `WorkerInfo` constructor when invalid names are passed in, so I don't think we need to complicate it by initializing RPC and exposing ourselves to potential flakiness.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30620
Differential Revision: D18766955
Pulled By: rohan-varma
fbshipit-source-id: 11643de4d57431e5f46e096c7766de3ab0b9b05a
Committed by: Facebook Github Bot
Parent: f9f54201d3
Commit: 5a484245d9
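The change itself is small: name validation lives in the `WorkerInfo` constructor, so it can be exercised without any store, rendezvous, or RPC agent. The sketch below is illustrative rather than the exact test from this commit (the real test sits in the `RpcTest` fixture shown in the diff); it assumes a PyTorch build where `WorkerInfo` is importable from `torch.distributed.rpc`, and the error-message patterns are the ones used in the diff.

import unittest

from torch.distributed.rpc import WorkerInfo


class WorkerInfoNameValidationTest(unittest.TestCase):
    # Hypothetical standalone test class, for illustration only.
    def test_invalid_names(self):
        worker_id = 0
        # Name contains a disallowed character.
        with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
            WorkerInfo("abc*", worker_id)
        # Empty name.
        with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
            WorkerInfo("", worker_id)
        # Name longer than MAX_NAME_LEN (128 at the time of this change).
        with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
            WorkerInfo("a" * 500, worker_id)


if __name__ == "__main__":
    unittest.main()

Because no FileStore or process group is created, the per-case file-name juggling and the shutdown/barrier handling in the removed code are no longer needed.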
@@ -396,86 +396,23 @@ class RpcTest(RpcAgentTestFixture):
         )
         rpc.shutdown()
 
-    @unittest.skip("test_invalid_names is flaky, see https://github.com/pytorch/pytorch/issues/25912")
     @dist_init(setup_rpc=False)
     def test_invalid_names(self):
+        from torch.distributed.rpc import WorkerInfo
+        worker_id = 0
         with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="abc*",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
-
-        base_file_name = self.file_name
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "1"
+            info = WorkerInfo("abc*", worker_id)
 
         with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name=" ",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
+            info = WorkerInfo(" ", worker_id)
 
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "2"
         with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="",
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
+            info = WorkerInfo("", worker_id)
 
-        # Use a different file path for FileStore to avoid rendezvous mismatch.
-        self.file_name = base_file_name + "3"
         # If the number in the message does not match, it is likely that the
         # value of MAX_NAME_LEN in RPC WorkerInfo has changed.
         with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
-            store, _, _ = next(torch.distributed.rendezvous(
-                self.init_method, rank=self.rank, world_size=self.world_size
-            ))
-            rpc._init_rpc_backend(
-                backend=self.rpc_backend,
-                store=store,
-                name="".join(["a" for i in range(500)]),
-                rank=self.rank,
-                world_size=self.world_size,
-                rpc_backend_options=self.rpc_backend_options,
-            )
-
-        from torch.distributed.rpc.api import _agent
-        self.assertEqual(_agent, None)
-        # shutdown() should not do anything as _agent is None
-        rpc.shutdown()
-        # We need this barrier here because although init_process_group is
-        # blocking, it does not guarantee that all ranks are done with
-        # initialization after the call. We did run into issues with it where
-        # rank 3 crashed with "connection closed by peer" RuntimeError, which is
-        # caused by other ranks exit before rank 3 is ready. This can be fixed
-        # by adding a collective call to sync all processes.
-        #
-        # We decided not fixing this issue in init_process_group because it
-        # would add extra overhead to the call, and normal use cases won't
-        # create a progress group and exit without doing anything. Hence, it is
-        # not worthy to introduce the overhead just for this test case.
+            info = WorkerInfo("".join(["a" for i in range(500)]), worker_id)
 
     @dist_init
     def test_add(self):
@@ -44,6 +44,10 @@ PyObject* rpc_init(PyObject* /* unused */) {
           module,
           "WorkerInfo",
           R"(Encapsulates information of a worker in the system.)")
+          .def(
+              py::init<std::string, worker_id_t>(),
+              py::arg("name"),
+              py::arg("id"))
           .def_readonly("name", &WorkerInfo::name_, R"(Name of the worker.)")
           .def_readonly(
               "id", &WorkerInfo::id_, R"(Globally unique id of the worker.)")
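The `py::init<std::string, worker_id_t>()` binding added in this hunk is what lets Python code construct a `WorkerInfo` directly, which is exactly what the simplified test relies on. A minimal usage sketch follows; the worker name and id are arbitrary example values, not anything taken from this commit.

from torch.distributed.rpc import WorkerInfo

# The constructor runs the C++-side name validation.
info = WorkerInfo(name="worker0", id=0)

# Read-only attributes exposed via def_readonly above.
print(info.name)  # "worker0"
print(info.id)    # 0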