[fr] Added protection against missing stack frames in fr cont. (#150133)

Summary: Previously we had D70358287, which didn't fully resolve the issue.

Test Plan:
# FR
`buck2 run @//mode/opt //caffe2/fb/flight_recorder:fr_trace -- --mast_job_id f710320638-TrainingApplication --mast_job_version 0 --mast_job_attempt 0 --bucket tlcm_log_blob --world_size 128 --dump_file_name_offset 0 --allow-incomplete-ranks`
Confirm no error
# FR analyzer
`buck2 run @//mode/opt //investigations/dr_patternson/analyzers/ai_observability:ai_observability-all-analyzers-cli -- flight_recorder_analyzer --mast_job_name f710320638-TrainingApplication --mast_job_version 0 --mast_job_attempt 0`
Confirm no error

Differential Revision: D71998980

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150133
Approved by: https://github.com/fduwjj
This commit is contained in:
Phillip Liu
2025-04-01 03:07:55 +00:00
committed by PyTorch MergeBot
parent 827b730f4e
commit 31634b8c6a

View File

@ -224,7 +224,7 @@ class EntryState:
self.input_sizes = entry["input_sizes"]
self.output_sizes = entry["output_sizes"]
self.collective_state = entry["state"]
self.collective_frames = entry["frames"]
self.collective_frames = entry.get("frames", [])
self.expected_ranks = expected_ranks
self.missing_ranks: set[int]
self.input_numel: int
@ -316,7 +316,7 @@ class EntryState:
output_sizes=entry["output_sizes"],
expected_ranks=self.expected_ranks,
collective_state=entry["state"],
collective_frames=entry["frames"],
collective_frames=entry.get("frames", []),
type_of_mismatch=error,
)
return Collective(