mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
[APF Logging][Error Trait] To fill the errorTraits for ChildFailedError with signal abort (re-attempt of #165476) (#165688)
**Summary** Land @guoding83128's PR https://github.com/pytorch/pytorch/pull/165476 on his behalf due to EasyCLA blocking. Refer to his original PR for details. In short, elastic leaves 'errorTraits' as unknown when the error dump file is missing; this PR adds a "system terminated error" trait for that case so the internal scuba table can aggregate correctly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165688 Approved by: https://github.com/fduwjj
This commit is contained in:
committed by
PyTorch MergeBot
parent
b44fb14906
commit
d0c24b392c
@ -79,9 +79,9 @@ __all__ = [
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
JSON = dict
|
||||
JSON = dict[str, Any]
|
||||
|
||||
_EMPTY_ERROR_DATA = {"message": "<NONE>"}
|
||||
_EMPTY_ERROR_DATA: dict[str, Any] = {"message": "<NONE>"}
|
||||
_NOT_AVAILABLE = "<N/A>"
|
||||
|
||||
_R = TypeVar("_R")
|
||||
@ -143,6 +143,10 @@ class ProcessFailure:
|
||||
f" received by PID {self.pid}"
|
||||
)
|
||||
else:
|
||||
self.error_file_data["errorTraits"] = {
|
||||
"category": "system_terminated_error",
|
||||
"retryability": "False",
|
||||
}
|
||||
self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html"
|
||||
|
||||
def _get_error_data(self, error_file_data: dict[str, Any]) -> tuple[str, int]:
|
||||
|
Reference in New Issue
Block a user