Update on "[c10d] Integrate ncclGather into PT"

NCCL 2.28.3 now supports `ncclGather`, this PR aims at integrating it. (release note: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-28-3.html#rel_2-28-3) [ghstack-poisoned]
2025-11-03 07:24:58 +08:00 · 2025-09-30 12:19:43 -07:00
parent 02193228f1 ccd5093362
commit a9753a8465
1 changed files with 2 additions and 2 deletions
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@ -1092,8 +1092,8 @@ void gather(
  void* recv_ptr = nullptr;
  at::Tensor flat; // keep alive until after NCCL call
  if (cur_rank == root) {
-    TORCH_CHECK(
-        (int)outputs.size() == numranks,
+    TORCH_CHECK_VALUE(
+        static_cast<int>(outputs.size()) == numranks,
        "root must provide inputs.size()==numranks");
    // Allocate one flat buffer [world_size * count]
    flat = at::empty({numranks * count}, inputs.options());