[profiler] record nccl version in distributed info (#121044)

Summary: Add an NCCL version field to the distributed info when the backend is NCCL.
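For context, torch.cuda.nccl.version() returns the linked NCCL version as a tuple of integers, and the new field records it as a dotted string. A minimal sketch of that conversion on its own, outside the profiler:

    import torch

    # Tuple of ints, e.g. (2, 19, 3), for the NCCL build PyTorch links against.
    nccl_version = torch.cuda.nccl.version()

    # The profiler stores the same value as a dotted string, e.g. "2.19.3".
    print(".".join(str(v) for v in nccl_version))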

Differential Revision: D54432888

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121044
Approved by: https://github.com/aaronenyeshi
Author: Shengbao Zheng
Date: 2024-03-07 15:56:02 +00:00
Committed by: PyTorch MergeBot
Parent: 3aa512cd72
Commit: eea37c6db4


@@ -268,13 +268,18 @@ class _KinetoProfile:
         if not dist.is_available() or not dist.is_initialized():
             return None
 
-        return {
-            "backend": dist.get_backend(),
+        backend = dist.get_backend()
+        dist_info = {
+            "backend": backend,
             "rank": dist.get_rank(),
             "world_size": dist.get_world_size(),
             "pg_count": dist.get_pg_count(),
             "pg_config": dist.distributed_c10d._get_all_pg_configs(),
         }
+        if backend == "nccl":
+            nccl_version = torch.cuda.nccl.version()
+            dist_info["nccl_version"] = ".".join(str(v) for v in nccl_version)
+        return dist_info
 
     def _memory_profile(self) -> MemoryProfile:
         required = ("record_shapes", "profile_memory", "with_stack")
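A hedged usage sketch of where the new field surfaces: when profiling a distributed run on the NCCL backend, the distributed info collected by _get_distributed_info() (now including nccl_version) is attached to the profiler's trace metadata. The single-rank process group below is only for illustration and assumes a CUDA machine with NCCL available:

    import os
    import torch
    import torch.distributed as dist
    from torch.profiler import profile, ProfilerActivity

    # Single-rank group purely for illustration; real jobs use torchrun or another launcher.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=0, world_size=1)

    x = torch.randn(1024, 1024, device="cuda")
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        y = x @ x
        dist.all_reduce(y)

    # The exported trace carries the distributed info (backend, rank, world_size,
    # pg info, and nccl_version when the backend is NCCL) in its metadata.
    prof.export_chrome_trace("nccl_trace.json")
    dist.destroy_process_group()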