[c10d] Lessen density of barrier warning (#162015)
Warnings are great, but too dense when there are many ranks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162015
Approved by: https://github.com/d4l3k, https://github.com/H-Huang
@@ -5075,14 +5075,12 @@ c10::DeviceIndex ProcessGroupNCCL::guessDeviceId() const {
   // offset wrt the device id if intra-node GPUs are sharded into multiple
   // dimensions.
   int devIdx = globalRank() % localDeviceCount_;
-  LOG(WARNING)
-      << logPrefix()
-      << c10::str(
-             " using GPU ",
-             devIdx,
-             " as device used by this process is currently unknown. ",
-             "This can potentially cause a hang if this rank to GPU mapping is incorrect. ",
-             "You can specify device_id in init_process_group() to force use of a particular device.");
+  if (devIdx == 0) { // only log on first rank of each node
+    LOG(WARNING) << c10::str(
+        "Guessing device ID based on global rank. ",
+        "This can cause a hang if rank to GPU mapping is heterogeneous. ",
+        "You can specify device_id in init_process_group()");
+  }
   return static_cast<c10::DeviceIndex>(devIdx);
 }
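For context, a minimal usage sketch (not part of this PR) of the alternative the warning points to: passing an explicit device to init_process_group() so guessDeviceId() is never consulted and the warning above never fires. It assumes a torchrun-style launch with one process per GPU, which exports LOCAL_RANK.

import os
import torch
import torch.distributed as dist

# Assumption: torchrun-style launch, one process per GPU, LOCAL_RANK set per process.
local_rank = int(os.environ["LOCAL_RANK"])

# Binding the process group to a concrete device removes the need for
# ProcessGroupNCCL to guess a device from the global rank.
dist.init_process_group(
    backend="nccl",
    device_id=torch.device("cuda", local_rank),
)

dist.barrier()  # runs on the bound device; no "Guessing device ID" warning
dist.destroy_process_group()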
@@ -4867,9 +4867,11 @@ def barrier(
         # may use default device 0, causing issues like hang or all processes
         # creating context on device 0.
         opts.device = device
-        warnings.warn(  # warn only once
-            "No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. "
-        )
+        if group.rank() == 0:
+            warnings.warn(  # warn only once
+                "barrier(): using the device under current context. "
+                "You can specify `device_id` in `init_process_group` to mute this warning."
+            )
 
     work = group.barrier(opts=opts)
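As an illustration (assumptions of mine, not code from this PR): a job that does not pass device_id still works after this change, but only global rank 0 prints the fallback warning instead of every rank. The sketch below uses the usual torchrun environment and pins the current device by hand, mirroring the rank-based guess on the C++ side.

import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")  # no device_id: barrier() falls back to the current device

# Set the current device explicitly so the fallback mapping is correct;
# after this PR only rank 0 emits the "using the device under current context" warning.
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

dist.barrier()
dist.destroy_process_group()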