mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[TorchElastic] Add option to configure log prefix for each rank (#112357)
Summary: Add the ability to customize log lines and additional template-like behavior to enrich log information. Motivation: a) Log stream processing/aggregation gains additional value when it includes information about the global rank. Extension to that is that it will be easier to map ranks to hosts from log stream information (less relevant at the moment) b) Users can easily map the failure to the right rank without matching node rank offset+local rank. Implementation - BC change - keeps the logs line prefix as `[<role name><local rank>]:` - Optional env variable TORCHELASTIC_LOG_LINE_HEADER that will be used as a prefix when specified and currently exposes `role_name`, `rank` and `local_rank` variables that will be bound when agent assigns the ranks. Test Plan: CI https://fburl.com/mlhub/mzx5xspv Differential Revision: D50584590 Pull Request resolved: https://github.com/pytorch/pytorch/pull/112357 Approved by: https://github.com/kiukchung
This commit is contained in:
committed by
PyTorch MergeBot
parent
d1c092ae1b
commit
bae8506589
@ -208,6 +208,7 @@ class PContext(abc.ABC):
|
||||
tee_stdouts: Dict[int, str],
|
||||
tee_stderrs: Dict[int, str],
|
||||
error_files: Dict[int, str],
|
||||
log_line_prefixes: Optional[Dict[int, str]] = None,
|
||||
):
|
||||
self.name = name
|
||||
# validate that all mappings have the same number of keys and
|
||||
@ -224,8 +225,8 @@ class PContext(abc.ABC):
|
||||
self.error_files = error_files
|
||||
self.nprocs = nprocs
|
||||
|
||||
self._stdout_tail = TailLog(name, tee_stdouts, sys.stdout)
|
||||
self._stderr_tail = TailLog(name, tee_stderrs, sys.stderr)
|
||||
self._stdout_tail = TailLog(name, tee_stdouts, sys.stdout, log_line_prefixes)
|
||||
self._stderr_tail = TailLog(name, tee_stderrs, sys.stderr, log_line_prefixes)
|
||||
|
||||
def start(self) -> None:
|
||||
"""
|
||||
@ -389,6 +390,7 @@ class MultiprocessContext(PContext):
|
||||
tee_stderrs: Dict[int, str],
|
||||
error_files: Dict[int, str],
|
||||
start_method: str,
|
||||
log_line_prefixes: Optional[Dict[int, str]] = None,
|
||||
):
|
||||
super().__init__(
|
||||
name,
|
||||
@ -400,6 +402,7 @@ class MultiprocessContext(PContext):
|
||||
tee_stdouts,
|
||||
tee_stderrs,
|
||||
error_files,
|
||||
log_line_prefixes,
|
||||
)
|
||||
|
||||
self.start_method = start_method
|
||||
@ -611,6 +614,7 @@ class SubprocessContext(PContext):
|
||||
tee_stdouts: Dict[int, str],
|
||||
tee_stderrs: Dict[int, str],
|
||||
error_files: Dict[int, str],
|
||||
log_line_prefixes: Optional[Dict[int, str]] = None,
|
||||
):
|
||||
super().__init__(
|
||||
name,
|
||||
@ -622,6 +626,7 @@ class SubprocessContext(PContext):
|
||||
tee_stdouts,
|
||||
tee_stderrs,
|
||||
error_files,
|
||||
log_line_prefixes,
|
||||
)
|
||||
|
||||
# state vector; _vdone[local_rank] -> is local_rank finished or not
|
||||
|
Reference in New Issue
Block a user