[c10d] Lessen density of barrier warning (#162015)

Warnings are great, but too dense when there are many ranks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162015
Approved by: https://github.com/d4l3k, https://github.com/H-Huang
Author: Ke Wen
Date: 2025-09-02 15:51:54 -07:00
Committed by: PyTorch MergeBot
Parent: 90f50f7e68
Commit: 9b81fe281d

2 changed files with 11 additions and 11 deletions

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -5075,14 +5075,12 @@ c10::DeviceIndex ProcessGroupNCCL::guessDeviceId() const {
   // offset wrt the device id if intra-node GPUs are sharded into multiple
   // dimensions.
   int devIdx = globalRank() % localDeviceCount_;
-  LOG(WARNING)
-      << logPrefix()
-      << c10::str(
-             " using GPU ",
-             devIdx,
-             " as device used by this process is currently unknown. ",
-             "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
-             "You can specify device_id in init_process_group() to force use of a particular device.");
+  if (devIdx == 0) { // only log on first rank of each node
+    LOG(WARNING) << c10::str(
+        "Guessing device ID based on global rank. ",
+        "This can cause a hang if rank to GPU mapping is heterogeneous. ",
+        "You can specify device_id in init_process_group()");
+  }
   return static_cast<c10::DeviceIndex>(devIdx);
 }
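For illustration, a minimal Python sketch of the gating idea, not PyTorch code: with the heuristic devIdx = globalRank() % localDeviceCount_, exactly one rank per node lands on device 0, so gating the log on devIdx == 0 cuts the output from one warning per rank to one per node. guess_device_id and local_device_count below are hypothetical stand-ins for the C++ names above.

    def guess_device_id(global_rank: int, local_device_count: int) -> int:
        # Hypothetical stand-in for ProcessGroupNCCL::guessDeviceId():
        # each rank guesses GPU (global_rank % local_device_count).
        dev_idx = global_rank % local_device_count
        if dev_idx == 0:
            # Only the first rank of each node logs the guess.
            print(f"[rank {global_rank}] Guessing device ID based on global rank; "
                  "specify device_id in init_process_group() to silence this.")
        return dev_idx

    # Simulating a 16-rank job with 8 GPUs per node: only ranks 0 and 8
    # (one per node) print the warning, instead of all 16 ranks.
    for rank in range(16):
        guess_device_id(rank, 8)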

torch/distributed/distributed_c10d.py

@@ -4867,9 +4867,11 @@ def barrier(
         # may use default device 0, causing issues like hang or all processes
         # creating context on device 0.
         opts.device = device
-        warnings.warn(  # warn only once
-            "No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. "
-        )
+        if group.rank() == 0:
+            warnings.warn(  # warn only once
+                "barrier(): using the device under current context. "
+                "You can specify `device_id` in `init_process_group` to mute this warning."
+            )
     work = group.barrier(opts=opts)
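To avoid the warning altogether, the new message points at device_id in init_process_group(). A minimal usage sketch, assuming a NCCL backend and the LOCAL_RANK environment variable set by torchrun:

    # Launch with: torchrun --nproc-per-node=8 this_script.py
    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun

    # Passing device_id makes the rank -> GPU mapping explicit, so
    # barrier() has no need to warn about guessing the device.
    dist.init_process_group(
        backend="nccl",
        device_id=torch.device(f"cuda:{local_rank}"),
    )

    dist.barrier()  # no "using the device under current context" warning
    dist.destroy_process_group()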