mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[c10d] ProcessGroupGloo: support per operation timeouts (#158128)
This updates ProcessGroupGloo to support per-operation timeouts. Previously, the timeouts were ignored even if they were set.

* This checks whether the timeout is `kUnsetTimeout` and conditionally uses either the provided timeout or the default timeout from the context.
* This exposes `set_timeout` as a standard method on ProcessGroup/Backend so that the global timeout can be tested.

Test plan:

```
pytest test/distributed/test_c10d_gloo.py -v -k allreduce_timeout
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158128
Approved by: https://github.com/H-Huang, https://github.com/fduwjj
This commit is contained in:
committed by
PyTorch MergeBot
parent
a8ec7babcf
commit
2a8795a981
@ -164,6 +164,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void setTimeout(std::chrono::milliseconds timeout) {
|
||||
for (auto& backend : backendTypeToBackend_) {
|
||||
backend.second->setTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void startCoalescing(c10::DeviceType deviceType) {
|
||||
// only nccl has implemented startCoalescing so only execute for nccl
|
||||
// backends
|
||||
|
||||
Reference in New Issue
Block a user