mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Adding entry-point based support for out-of-tree rendezvous plugins (#132633)
Fixes #127519 Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their out-of-tree rendezvous backend implementations as Python packages. #### AUTHORING NEW PLUGIN Any new plugin will be a Python package exposing entry-points. For example, the structure of a redis plugin is as follows: ``` plugin_root |_ pyproject.toml |_ src |_ redis |_ __init__.py |_ redis_store.py |_ redis_backend.py ``` The contents of the `pyproject.toml` should indicate that the package exposes a torchrun entry-point by mentioning the group name `torchrun.handlers`. The `pyproject.toml` for the redis plugin would be as follows: ``` [project] name = "redis" version = "0.0.1" [project.entry-points.'torchrun.handlers'] redis = 'redis:getPluginHandler' ``` The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows: ``` def getPluginHandler(): def _create_redis_handler(params: RendezvousParameters): from redis_rendezvous_backend import create_backend backend, store = create_backend(params) return create_handler(store, backend, params) return _create_redis_handler ``` The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)
) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)
) respectively. #### USER EXPERIENCE Before using the plugin for the first time, the user has to install the plugin package. For example, a published package can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`. Once installed, the new backend can be used in torchrun as follows: ``` torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633 Approved by: https://github.com/wconstab
This commit is contained in:
committed by
PyTorch MergeBot
parent
4a18fcf7af
commit
136b19b062
@ -4,6 +4,9 @@
|
||||
# This source code is licensed under the BSD-style license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from .api import (
|
||||
rendezvous_handler_registry as handler_registry,
|
||||
RendezvousHandler,
|
||||
@ -12,6 +15,13 @@ from .api import (
|
||||
from .dynamic_rendezvous import create_handler
|
||||
|
||||
|
||||
if sys.version_info < (3, 10):
|
||||
from importlib_metadata import entry_points
|
||||
else:
|
||||
from importlib.metadata import entry_points
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = ["get_rendezvous_handler"]
|
||||
|
||||
|
||||
@ -50,6 +60,21 @@ def _register_default_handlers() -> None:
|
||||
handler_registry.register("static", _create_static_handler)
|
||||
|
||||
|
||||
def _register_out_of_tree_handlers() -> None:
    """Register rendezvous handlers exposed by installed out-of-tree plugins.

    Every entry point in the ``torchrun.handlers`` group is loaded and the
    loaded object is called with no arguments; the value it returns is
    registered in ``handler_registry`` under the entry point's name.

    Registration is best-effort: any exception raised while loading, calling,
    or registering a single plugin is logged as a warning (with traceback) and
    the remaining plugins are still processed.
    """
    discovered_handler_generators = entry_points(group="torchrun.handlers")

    for handler_generator in discovered_handler_generators:
        try:
            # Load the entry point being iterated directly. Looking it up
            # again by name rescans the collection on every iteration and
            # resolves by name only, so plugins that share a name would all
            # load the same entry point.
            get_handler = handler_generator.load()
            handler_registry.register(handler_generator.name, get_handler())
        except Exception:
            # Deliberately broad: a broken third-party plugin must not
            # prevent the remaining handlers from being registered.
            log.warning(
                "Exception while registering out of tree plugin %s: ",
                handler_generator.name,
                exc_info=True,
            )
|
||||
|
||||
|
||||
def get_rendezvous_handler(params: RendezvousParameters) -> RendezvousHandler:
|
||||
"""
|
||||
Obtain a reference to a :py:class`RendezvousHandler`.
|
||||
|
Reference in New Issue
Block a user