[c10d][NCCL] Implement ncclCommInitRankScalable (merging #136789) (#144794)

Land the changes from https://github.com/pytorch/pytorch/pull/136789/files on our end and fix any remaining issues.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144794
Approved by: https://github.com/kwen2501, https://github.com/eqy, https://github.com/atalman
This commit is contained in:
fduwjj
2025-01-30 22:30:02 -08:00
committed by PyTorch MergeBot
parent af2a39849d
commit eb029fba13
5 changed files with 203 additions and 15 deletions

View File

@ -26,6 +26,10 @@ constexpr int64_t kCommInitBusyWaitMillis = 2;
#define NCCL_HAS_COMM_SPLIT
#endif
// Scalable communicator init (ncclCommInitRankScalable) is enabled only for
// NCCL versions 2.23+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 23, 0)
#define NCCL_HAS_INIT_RANK_SCALABLE
#endif
// ncclGetLastError() is enabled only for NCCL versions 2.13+
// ncclRemoteError only exists in NCCL versions 2.13+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 13, 0)
@ -212,6 +216,13 @@ class NCCLComm {
ncclUniqueId commId,
at::DeviceIndex deviceIndex,
ncclConfig_t& config);
#ifdef NCCL_HAS_INIT_RANK_SCALABLE
// Static factory that returns a shared NCCLComm. Declared only when
// NCCL_HAS_INIT_RANK_SCALABLE is defined, and it sits inside the enclosing
// NCCL_HAS_CONFIG guard as well.
//
// Parameters:
//   numRanks - presumably the total number of ranks in the communicator
//              (TODO confirm against the definition).
//   rank     - presumably this process's rank within the communicator
//              (TODO confirm against the definition).
//   commIds  - a vector of ncclUniqueId values rather than a single id.
//              NOTE(review): taken by non-const reference; confirm whether
//              the implementation actually needs to mutate it.
//   config   - NCCL config struct, passed by non-const reference.
//
// NOTE(review): unlike the config-taking declaration directly above, this
// overload has no at::DeviceIndex parameter -- confirm how the device is
// selected in the implementation.
static std::shared_ptr<NCCLComm> create_scalable(
int numRanks,
int rank,
std::vector<ncclUniqueId>& commIds,
ncclConfig_t& config);
#endif // NCCL_HAS_INIT_RANK_SCALABLE
#endif // NCCL_HAS_CONFIG
#ifdef NCCL_HAS_COMM_SPLIT