[FR] Make pg_name unique, show P2P collective status and fix bugs when running the script as a command (#134780)

Fixes a bunch of bugs in the script that show up when running it via the generated command and with 3D parallelism.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134780
Approved by: https://github.com/c-p-i-o
ghstack dependencies: #134528
This commit is contained in:
fduwjj
2024-08-29 20:25:14 -07:00
committed by PyTorch MergeBot
parent 15f5a4858b
commit 1993a2aa9e
7 changed files with 118 additions and 59 deletions

View File

@ -28,8 +28,8 @@ python fr_trace.py -d <dump dir containing trace files> [-o <output file>]
- This script is versioned so that we can ensure our future changes to flight recorder are backwards compatible.
"""
import argparse
import pickle
from typing import Optional, Sequence
from tools.flight_recorder.components.builder import build_db
from tools.flight_recorder.components.config_manager import JobConfig
@ -37,7 +37,9 @@ from tools.flight_recorder.components.loader import read_dir
from tools.flight_recorder.components.types import types
def main(args: argparse.Namespace) -> None:
def main(args: Optional[Sequence[str]] = None) -> None:
config = JobConfig()
args = config.parse_args(args)
details = read_dir(args.prefix, args.dir)
db = build_db(details, args)
if args.output:
@ -46,5 +48,4 @@ def main(args: argparse.Namespace) -> None:
if __name__ == "__main__":
config = JobConfig()
main(config.parse_args())
main()