[c10d] Lessen density of barrier warning (#162015)

Warnings are great, but too dense when there are many ranks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162015
Approved by: https://github.com/d4l3k, https://github.com/H-Huang
Author: Ke Wen
Date: 2025-09-02 15:51:54 -07:00
Committed by: PyTorch MergeBot
Parent: 90f50f7e68
Commit: 9b81fe281d

2 changed files with 11 additions and 11 deletions

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -5075,14 +5075,12 @@ c10::DeviceIndex ProcessGroupNCCL::guessDeviceId() const {
   // offset wrt the device id if intra-node GPUs are sharded into multiple
   // dimensions.
   int devIdx = globalRank() % localDeviceCount_;
-  LOG(WARNING)
-      << logPrefix()
-      << c10::str(
-             " using GPU ",
-             devIdx,
-             " as device used by this process is currently unknown. ",
-             "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
-             "You can specify device_id in init_process_group() to force use of a particular device.");
+  if (devIdx == 0) { // only log on first rank of each node
+    LOG(WARNING) << c10::str(
+        "Guessing device ID based on global rank. ",
+        "This can cause a hang if rank to GPU mapping is heterogeneous. ",
+        "You can specify device_id in init_process_group()");
+  }
   return static_cast<c10::DeviceIndex>(devIdx);
 }
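For illustration, a minimal Python sketch of the gating idea, not PyTorch code: with the heuristic devIdx = globalRank() % localDeviceCount_, exactly one rank per node lands on device 0, so gating the log on devIdx == 0 cuts the output from one warning per rank to one per node. guess_device_id and local_device_count below are hypothetical stand-ins for the C++ names above.

    def guess_device_id(global_rank: int, local_device_count: int) -> int:
        # Hypothetical stand-in for ProcessGroupNCCL::guessDeviceId():
        # each rank guesses GPU (global_rank % local_device_count).
        dev_idx = global_rank % local_device_count
        if dev_idx == 0:
            # Only the first rank of each node logs the guess.
            print(f"[rank {global_rank}] Guessing device ID based on global rank; "
                  "specify device_id in init_process_group() to silence this.")
        return dev_idx

    # Simulating a 16-rank job with 8 GPUs per node: only ranks 0 and 8
    # (one per node) print the warning, instead of all 16 ranks.
    for rank in range(16):
        guess_device_id(rank, 8)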

torch/distributed/distributed_c10d.py

@@ -4867,9 +4867,11 @@ def barrier(
         # may use default device 0, causing issues like hang or all processes
         # creating context on device 0.
         opts.device = device
-        warnings.warn(  # warn only once
-            "No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. "
-        )
+        if group.rank() == 0:
+            warnings.warn(  # warn only once
+                "barrier(): using the device under current context. "
+                "You can specify `device_id` in `init_process_group` to mute this warning."
+            )
     work = group.barrier(opts=opts)
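To avoid the warning altogether, the new message points at device_id in init_process_group(). A minimal usage sketch, assuming a NCCL backend and the LOCAL_RANK environment variable set by torchrun:

    # Launch with: torchrun --nproc-per-node=8 this_script.py
    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun

    # Passing device_id makes the rank -> GPU mapping explicit, so
    # barrier() has no need to warn about guessing the device.
    dist.init_process_group(
        backend="nccl",
        device_id=torch.device(f"cuda:{local_rank}"),
    )

    dist.barrier()  # no "using the device under current context" warning
    dist.destroy_process_group()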