mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[fr] Added protection against missing stack frames in fr cont. (#150133)
Summary: Previously we had D70358287, which didn't fully resolve the issue. Test Plan: # FR `buck2 run @//mode/opt //caffe2/fb/flight_recorder:fr_trace -- --mast_job_id f710320638-TrainingApplication --mast_job_version 0 --mast_job_attempt 0 --bucket tlcm_log_blob --world_size 128 --dump_file_name_offset 0 --allow-incomplete-ranks` Confirm no error # FR analyzer `buck2 run @//mode/opt //investigations/dr_patternson/analyzers/ai_observability:ai_observability-all-analyzers-cli -- flight_recorder_analyzer --mast_job_name f710320638-TrainingApplication --mast_job_version 0 --mast_job_attempt 0` Confirm no error Differential Revision: D71998980 Pull Request resolved: https://github.com/pytorch/pytorch/pull/150133 Approved by: https://github.com/fduwjj
This commit is contained in:
committed by
PyTorch MergeBot
parent
827b730f4e
commit
31634b8c6a
@ -224,7 +224,7 @@ class EntryState:
|
||||
self.input_sizes = entry["input_sizes"]
|
||||
self.output_sizes = entry["output_sizes"]
|
||||
self.collective_state = entry["state"]
|
||||
self.collective_frames = entry["frames"]
|
||||
self.collective_frames = entry.get("frames", [])
|
||||
self.expected_ranks = expected_ranks
|
||||
self.missing_ranks: set[int]
|
||||
self.input_numel: int
|
||||
@ -316,7 +316,7 @@ class EntryState:
|
||||
output_sizes=entry["output_sizes"],
|
||||
expected_ranks=self.expected_ranks,
|
||||
collective_state=entry["state"],
|
||||
collective_frames=entry["frames"],
|
||||
collective_frames=entry.get("frames", []),
|
||||
type_of_mismatch=error,
|
||||
)
|
||||
return Collective(
|
||||
|
Reference in New Issue
Block a user