[TorchElastic] Add option to configure log prefix for each rank (#112357)

Summary:
Add the ability to customize log line prefixes, with template-like behavior to enrich log lines with additional information.

Motivation:
a) Log stream processing/aggregation gains additional value when each line includes the global rank. As an extension, this also makes it easier to map ranks to hosts from the log stream alone (less relevant at the moment).
b) Users can map a failure directly to the right rank without manually computing node rank offset + local rank.

Implementation:
- Backwards compatible: by default, the log line prefix stays `[<role name><local rank>]:`.
- Optional env variable TORCHELASTIC_LOG_LINE_HEADER is used as the prefix template when specified. It currently exposes the `role_name`, `rank`, and `local_rank` variables, which are bound when the agent assigns ranks.
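As a rough illustration (the exact template syntax is not spelled out in this commit message, so the `${...}` placeholder style below is an assumption), binding the exposed variables into a header template could look like:

```python
from string import Template

# Hypothetical value of TORCHELASTIC_LOG_LINE_HEADER (placeholder syntax assumed).
header_template = "[${role_name}-${rank}/${local_rank}]:"

# The agent would bind these variables when it assigns ranks.
prefix = Template(header_template).safe_substitute(
    role_name="trainer", rank=3, local_rank=1
)
print(prefix)  # [trainer-3/1]:
```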

Test Plan:
CI

https://fburl.com/mlhub/mzx5xspv

Differential Revision: D50584590

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112357
Approved by: https://github.com/kiukchung
Author: Kurman Karabukaev
Date: 2023-11-08 01:00:22 +00:00
Committed by: PyTorch MergeBot
Parent: d1c092ae1b
Commit: bae8506589
9 changed files with 142 additions and 16 deletions


@@ -208,6 +208,7 @@ class PContext(abc.ABC):
         tee_stdouts: Dict[int, str],
         tee_stderrs: Dict[int, str],
         error_files: Dict[int, str],
+        log_line_prefixes: Optional[Dict[int, str]] = None,
     ):
         self.name = name
         # validate that all mappings have the same number of keys and
@@ -224,8 +225,8 @@
         self.error_files = error_files
         self.nprocs = nprocs
-        self._stdout_tail = TailLog(name, tee_stdouts, sys.stdout)
-        self._stderr_tail = TailLog(name, tee_stderrs, sys.stderr)
+        self._stdout_tail = TailLog(name, tee_stdouts, sys.stdout, log_line_prefixes)
+        self._stderr_tail = TailLog(name, tee_stderrs, sys.stderr, log_line_prefixes)
def start(self) -> None:
"""
@@ -389,6 +390,7 @@ class MultiprocessContext(PContext):
         tee_stderrs: Dict[int, str],
         error_files: Dict[int, str],
         start_method: str,
+        log_line_prefixes: Optional[Dict[int, str]] = None,
     ):
super().__init__(
name,
@@ -400,6 +402,7 @@ class MultiprocessContext(PContext):
             tee_stdouts,
             tee_stderrs,
             error_files,
+            log_line_prefixes,
         )
         self.start_method = start_method
@@ -611,6 +614,7 @@ class SubprocessContext(PContext):
         tee_stdouts: Dict[int, str],
         tee_stderrs: Dict[int, str],
         error_files: Dict[int, str],
+        log_line_prefixes: Optional[Dict[int, str]] = None,
     ):
super().__init__(
name,
@@ -622,6 +626,7 @@ class SubprocessContext(PContext):
             tee_stdouts,
             tee_stderrs,
             error_files,
+            log_line_prefixes,
         )
         # state vector; _vdone[local_rank] -> is local_rank finished or not