NUMA binding integration with elastic agent and torchrun (#149334)

Implements #148689

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149334
Approved by: https://github.com/d4l3k

Co-authored-by: Paul de Supinski <pdesupinski@gmail.com>
This commit is contained in:
raghavhrishi
2025-07-25 21:19:45 +00:00
committed by PyTorch MergeBot
parent 24b1f10ca1
commit 7ef3c3357d
13 changed files with 1383 additions and 5 deletions

View File

@@ -37,6 +37,7 @@ from torch.distributed.elastic.multiprocessing.subprocess_handler import (
SubprocessHandler,
)
from torch.distributed.elastic.multiprocessing.tail_log import TailLog
from torch.distributed.numa.binding import maybe_wrap_with_numa_bindings, NumaOptions
IS_WINDOWS = sys.platform == "win32"
@@ -811,7 +812,12 @@ class SubprocessContext(PContext):
envs: dict[int, dict[str, str]],
logs_specs: LogsSpecs,
log_line_prefixes: Optional[dict[int, str]] = None,
numa_options: Optional[NumaOptions] = None,
):
entrypoint, args = maybe_wrap_with_numa_bindings(
entrypoint=entrypoint, local_rank_to_args=args, numa_options=numa_options
)
super().__init__(
name,
entrypoint,