[c10d] Prototype of merge_remote_group (#158287)

Tentative implementation of merge_remote_group per the proposal here: [docs.google.com/document/d/13R-1t_yESTvmAjcCN-wQjQQadIEu0JNIdS65uZawZzY/edit?tab=t.0#heading=h.3ctbqqopzc89](https://docs.google.com/document/d/13R-1t_yESTvmAjcCN-wQjQQadIEu0JNIdS65uZawZzY/edit?tab=t.0#heading=h.3ctbqqopzc89)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158287
Approved by: https://github.com/d4l3k
ghstack dependencies: #157716
This commit is contained in:
fduwjj
2025-07-16 07:13:57 -07:00
committed by PyTorch MergeBot
parent 944a140e90
commit f58a680d09
11 changed files with 194 additions and 9 deletions

View File

@ -2071,6 +2071,26 @@ communication mechanism.
py::arg("opts") = std::nullopt,
py::arg("groupDesc") = std::nullopt,
py::call_guard<py::gil_scoped_release>())
.def(
    "merge_remote_group",
    // Python-facing wrapper around ProcessGroup::mergeRemoteGroup: gathers
    // the timeout / group_name / group_desc arguments into a
    // ProcessGroup::MergeOptions and forwards them together with the store
    // and size.
    // NOTE(review): `size` is passed through untouched; presumably it is
    // the size of the merged group -- confirm against mergeRemoteGroup.
    [](const c10::intrusive_ptr<::c10d::ProcessGroup>& pg,
       const c10::intrusive_ptr<::c10d::Store>& store,
       int size,
       std::chrono::milliseconds timeout,
       std::optional<std::string> name,
       std::optional<std::string> desc) {
      ::c10d::ProcessGroup::MergeOptions mergeOpts;
      mergeOpts.group_name = name;
      mergeOpts.group_desc = desc;
      mergeOpts.timeout = timeout;
      return pg->mergeRemoteGroup(store, mergeOpts, size);
    },
    // `store` and `size` are required; the remaining keyword arguments
    // fall back to the defaults given here.
    py::arg("store"),
    py::arg("size"),
    py::arg("timeout") = kProcessGroupDefaultTimeout,
    py::arg("group_name") = std::nullopt,
    py::arg("group_desc") = std::nullopt,
    // The GIL is released for the duration of the call.
    py::call_guard<py::gil_scoped_release>())
.def(
"abort",
&::c10d::ProcessGroup::abort,