mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[PTD] Dump rcclexp proxy trace in pytorch (#143678)
Summary: Dump the active proxyOp status per rank and per communicator when the WatchDog times out or aborts. Added a `#if defined(USE_ROCM) && defined(NCCL_COMM_DUMP)` guard in the print function, so only rcclexp users will see this dump in the console. These are the PTD-side changes. Test Plan: Job with an A2A hang due to the receiver failing to post receive operations https://fburl.com/mlhub/95vg12r3 {F1971449692} Reviewed By: c-p-i-o Differential Revision: D67036093 Pull Request resolved: https://github.com/pytorch/pytorch/pull/143678 Approved by: https://github.com/c-p-i-o
This commit is contained in:
committed by
PyTorch MergeBot
parent
aa7d01ea22
commit
a881954b0c
@ -242,7 +242,7 @@ class NCCLComm {
|
||||
std::vector<uint64_t>& ranks_ull);
|
||||
#endif
|
||||
|
||||
#if defined(IS_NCCLX) && defined(NCCL_COMM_DUMP)
|
||||
#if (defined(IS_NCCLX) || defined(USE_ROCM)) && defined(NCCL_COMM_DUMP)
|
||||
std::unordered_map<std::string, std::string> ncclCommDump();
|
||||
#endif
|
||||
|
||||
@ -356,6 +356,9 @@ struct ncclRedOpRAII {
|
||||
bool premul_sum_ = false;
|
||||
};
|
||||
|
||||
// Prints the NCCL communicator proxy trace (declaration only; the definition
// is not part of this view).
//
// dumpReason: caller-supplied text describing why the dump is happening;
//             taken by value. Per the commit message the dump is triggered on
//             WatchDog timeout or abort -- presumably that is what this
//             string conveys; confirm against the call sites.
// dumpMap:    string-to-string map holding the dump contents. NOTE(review):
//             presumably the result of NCCLComm::ncclCommDump(); verify
//             against the caller.
//
// Per the commit message, the print function's output is guarded by
// `#if defined(USE_ROCM) && defined(NCCL_COMM_DUMP)`, so it is expected to
// produce console output only in rcclexp builds.
void printNcclCommProxyTrace(
|
||||
std::string dumpReason,
|
||||
const std::unordered_map<std::string, std::string>& dumpMap);
|
||||
} // namespace c10d
|
||||
|
||||
#endif // USE_C10D_NCCL
|
||||
|
Reference in New Issue
Block a user