mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/65946 Add new function in agent_utils to perform a synchronization of active call counts using store. This is intended to replace the barrier and all_reduce used by the process group in RPC shutdown. The `test_ddp_comparison` and `test_ddp_comparison_uneven_inputs` tests fail with these changes. It seems like the RPC agents are not accessing the same store, so the total count of processes never reaches the world size to exit the barrier; we still need to investigate why this happens only for these test cases. Setting clean_shutdown to false skips this code path, which allows the tests to pass. cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse SciPioneer H-Huang Test Plan: Imported from OSS Reviewed By: jbschlosser Differential Revision: D31762736 Pulled By: H-Huang fbshipit-source-id: cb5d0efe196f72726c63393c4293e97ec4f18548
29 lines
839 B
C++
29 lines
839 B
C++
#pragma once
|
|
|
|
#include <c10d/PrefixStore.hpp>
|
|
#include <torch/csrc/distributed/rpc/utils.h>
|
|
|
|
namespace torch {
|
|
namespace distributed {
|
|
namespace rpc {
|
|
|
|
// All RPC peers should call into this function at the same time. Each peer
|
|
// provides its own id and name, and this function uses the given Store to
|
|
// gather global name-to-id mapping on all peers.
|
|
std::unordered_map<std::string, worker_id_t> collectNames(
|
|
::c10d::PrefixStore store,
|
|
const worker_id_t selfId,
|
|
const std::string& selfName,
|
|
const int worldSize);
|
|
|
|
// This performs a synchronization of all call counts by using store.
|
|
// All RPC peers wait for others to join to exit at the same time.
|
|
int syncCallCount(
|
|
::c10d::PrefixStore store,
|
|
const int worldSize,
|
|
int activeCalls = 0);
|
|
|
|
} // namespace rpc
|
|
} // namespace distributed
|
|
} // namespace torch
|