Adding entry-point based support for out-of-tree rendezvous plugins (#132633)

Fixes #127519

Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their out-of-tree rendezvous backend implementations as Python packages.

#### AUTHORING NEW PLUGIN
Any new plugin will be a python package exposing entry-points. For example, the structure of redis plugin is as follows:

```
plugin_root
|_ pyproject.toml
|_ src
   |_ redis
      |_ __init__.py
      |_ redis_store.py
      |_ redis_backend.py
```

The contents of `pyproject.toml` should indicate that the package exposes a torchrun entry-point by mentioning the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:

```
[project]
name = "redis"
version = "0.0.1"

[project.entry-points.'torchrun.plugins']
redis = 'redis'
```

The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:

```
def getPluginHandler():
    def _create_redis_handler(params: RendezvousParameters):
        from redis_rendezvous_backend import create_backend
        backend, store = create_backend(params)
        return create_handler(store, backend, params)
    return _create_redis_handler
```

The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.

#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin package. For example, a published package can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.

Once installed, the new backend can be used in torchrun as follows:

```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/wconstab
This commit is contained in:
Sathyanarayanan Saravanamuthu
2024-08-27 07:09:39 +00:00
committed by PyTorch MergeBot
parent 4a18fcf7af
commit 136b19b062
5 changed files with 73 additions and 1 deletions

View File

@ -0,0 +1,38 @@
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import pathlib
import sys
import unittest
import torch.distributed.elastic.rendezvous as rdvz
# Name the test expects to find in the rendezvous handler registry once the
# out-of-tree test package is discoverable.
BACKEND_NAME = "testbackend"
# Suffix appended to this test file's directory to locate the test package
# sources that are temporarily added to ``sys.path``.
TEST_PACKAGE_PATH = "/out_of_tree_test_package/src"
class OutOfTreeRendezvousTest(unittest.TestCase):
    """Checks discovery of out-of-tree rendezvous handlers via entry points."""

    def test_out_of_tree_handler_loading(self):
        """The test backend is registered only once its package is importable."""
        package_path = (
            str(pathlib.Path(__file__).parent.resolve()) + TEST_PACKAGE_PATH
        )

        rdvz._register_out_of_tree_handlers()
        registry_dict = rdvz.rendezvous_handler_registry._registry

        # test backend should not be registered as a backend
        self.assertNotIn(BACKEND_NAME, registry_dict)

        # Including testbackend in python path
        sys.path.append(package_path)
        # Removing testbackend from python path when the test ends. Registered
        # as a cleanup so that sys.path is restored even if an assertion below
        # fails, instead of leaking the entry into later tests.
        self.addCleanup(sys.path.remove, package_path)

        # Registering the out of tree handlers again
        rdvz._register_out_of_tree_handlers()

        # test backend should be registered as a backend
        self.assertIn(BACKEND_NAME, registry_dict)

View File

@ -0,0 +1,6 @@
# Packaging metadata for the out-of-tree rendezvous test backend.
[project]
name = "testbackend"
version = "0.0.1"
# Entry point in the 'torchrun.handlers' group: the name "testbackend" maps to
# the `test_handler` callable in the `testbackend` module.
[project.entry-points.'torchrun.handlers']
testbackend = 'testbackend:test_handler'

View File

@ -0,0 +1,2 @@
def test_handler() -> str:
    """Return an empty string.

    This function is the target of the ``testbackend`` entry point declared in
    the test package's ``pyproject.toml``; its return value stands in for a
    real handler in the entry-point discovery test.
    """
    return ""

View File

@ -143,10 +143,11 @@ from .api import (
RendezvousStoreInfo,
RendezvousTimeoutError,
)
from .registry import _register_default_handlers
from .registry import _register_default_handlers, _register_out_of_tree_handlers
# Populate the handler registry when this package is imported: the built-in
# handlers first, then any handlers exposed by out-of-tree packages.
_register_default_handlers()
_register_out_of_tree_handlers()
__all__ = [

View File

@ -4,6 +4,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import logging
import sys
from .api import (
rendezvous_handler_registry as handler_registry,
RendezvousHandler,
@ -12,6 +15,13 @@ from .api import (
from .dynamic_rendezvous import create_handler
if sys.version_info < (3, 10):
from importlib_metadata import entry_points
else:
from importlib.metadata import entry_points
log = logging.getLogger(__name__)
__all__ = ["get_rendezvous_handler"]
@ -50,6 +60,21 @@ def _register_default_handlers() -> None:
handler_registry.register("static", _create_static_handler)
def _register_out_of_tree_handlers() -> None:
    """Register rendezvous handlers advertised through entry points.

    Every entry point in the ``torchrun.handlers`` group is loaded and the
    loaded object is called with no arguments; the value it returns is
    registered in the handler registry under the entry point's name.

    A failure while loading or registering one entry point is logged as a
    warning (with traceback) and the remaining entry points are still
    processed.
    """
    for handler_generator in entry_points(group="torchrun.handlers"):
        try:
            # Load the entry point being iterated directly. Looking it up
            # again by name in the collection is redundant and, when two
            # distributions expose the same name, would load the first match
            # rather than the current entry point.
            get_handler = handler_generator.load()
            handler_registry.register(handler_generator.name, get_handler())
        except Exception:
            # Best effort: one broken plugin should not stop the others from
            # being registered.
            log.warning(
                "Exception while registering out of tree plugin %s: ",
                handler_generator.name,
                exc_info=True,
            )
def get_rendezvous_handler(params: RendezvousParameters) -> RendezvousHandler:
"""
Obtain a reference to a :py:class`RendezvousHandler`.