mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27022 This change implements the "FAST" mode distributed autograd backward pass as described in https://github.com/pytorch/pytorch/issues/23110. At a high level the backward pass works as follows: 1. We start by computing dependencies on the node that calls `torch.distributed.backward`. 2. This node computes the dependencies starting from the root nodes provided in the backward call and all the 'send' functions present in the current autograd context. The "FAST" mode assumes all 'send' functions are part of the autograd computation. 3. Once the dependency computation is done, the distributed autograd engine calls the local autograd engine to execute the autograd graph. Note that the autograd graph on a single node is not necessarily connected because of inter-node communication. As a result, we have special handling to ensure the local autograd engine executes the entire graph starting from the provided roots and all 'send' functions on the node. 4. When the local autograd engine hits a 'recv' function, it performs an async RPC to send the gradients over to the appropriate node and stores a future in the autograd context to keep track of this RPC. 5. On the destination node, the appropriate 'send' function is looked up and enqueued on the local autograd engine. If this is the first time the node is hearing about this autograd context id on the backward pass, then the node computes dependencies for the local autograd engine. 6. As part of compute dependencies, the distributed autograd engine discovers all leaf nodes and ensures those are passed as 'outputs' to the local autograd engine. This avoids running the 'AccumulateGrad' function. 7. The gradients computed for the leaf nodes are then actually accumulated in `DistAutogradContext` for the appropriate autograd context id.
8. The distributed autograd engine waits for the local autograd engine to complete and also waits for all the 'Futures' (stored in 4.) for respective RPCs to finish. We have made the following changes to the local autograd engine for this purpose: 1. Expose GraphTask and NodeTask so that the distributed autograd engine can use them. 2. Expose an `execute_with_graph_task` API which allows the distributed engine to build a GraphTask and pass it to the local autograd engine. 3. Expose an `enqueue_on_cpu` API, which allows the distributed engine to build a `NodeTask` for a 'send' function and enqueue it on the local autograd engine. In addition to this, a few general improvements: 1. Added a `PropagateGradients` RPC call for the 'recv' function to pass gradients to the appropriate node during the backward pass. 2. Use IValues as much as possible in serialization for RpcWithAutograd. 3. If Future.wait() contains a message of type EXCEPTION, we throw an appropriate exception instead of just returning the message. This is in line with what most Future.wait() APIs do. 4. Added a `get_gradients(context_id)` API which allows users to retrieve a map from Tensor to respective gradient for the provided context_id on the local node. ghstack-source-id: 91794926 Test Plan: unit tests. Differential Revision: D17652615 fbshipit-source-id: 96f65c52adb2706ee29f4b49e1655afaa0a3bec3
190 lines
5.9 KiB
C++
190 lines
5.9 KiB
C++
#include <torch/csrc/distributed/rpc/rref.h>
|
|
|
|
#include <torch/csrc/distributed/rpc/python_rpc_handler.h>
|
|
#include <torch/csrc/distributed/rpc/rref_context.h>
|
|
#include <torch/csrc/distributed/rpc/rref_proto.h>
|
|
|
|
namespace torch {
|
|
namespace distributed {
|
|
namespace rpc {
|
|
|
|
namespace {

// Positions of the individual RRefForkData fields inside the py::tuple that
// RRefForkData::toPyTuple() builds and RRefForkData::fromPyTuple() reads.
constexpr int OWNER_IDX = 0; // index of ownerId in the tuple
constexpr int RREFID_ON_IDX = 1; // index of RRefId.createdOn_ in the tuple
constexpr int RREFID_ID_IDX = 2; // index of RRefId.localId_ in the tuple
constexpr int FORKID_ON_IDX = 3; // index of ForkId.createdOn_ in the tuple
constexpr int FORKID_ID_IDX = 4; // index of ForkId.localId_ in the tuple
constexpr int PARENT_IDX = 5; // index of parent in the tuple

// NB: if more fields are added, make sure this field is also bumped
constexpr int RFD_TUPLE_SIZE = 6; // number of RRefForkData fields in py::tuple

// Turn the NB above into a compile-time check: PARENT_IDX is the last index,
// so the tuple size must be exactly one past it. Adding a field without
// bumping RFD_TUPLE_SIZE (or vice versa) now fails the build.
static_assert(
    RFD_TUPLE_SIZE == PARENT_IDX + 1,
    "RFD_TUPLE_SIZE must be one past the last RRefForkData tuple index");

} // namespace
|
|
|
|
// Out-of-class definition of the static atomic member RRefContext::nextLocalId_,
// initialized to 0.
// NOTE(review): presumably the counter behind the localId_ part of globally
// unique ids handed out by RRefContext -- confirm against rref_context.h.
std::atomic<local_id_t> RRefContext::nextLocalId_{0};
|
|
|
|
////////////////////////// RRefForkData /////////////////////////////////
|
|
|
|
// Stores the four values that make up an RRefForkData verbatim: the owner
// worker id, the RRef id, the fork id and the parent worker id.
RRefForkData::RRefForkData(
    worker_id_t ownerId,
    const RRefId& rrefId,
    const ForkId& forkId,
    worker_id_t parent)
    : ownerId_(ownerId),
      rrefId_(rrefId),
      forkId_(forkId),
      parent_(parent) {
  // Nothing else to do; every member is set by the initializer list.
}
|
|
|
|
// Flattens this RRefForkData into a py::tuple of six entries. The element
// order (owner, rrefId.createdOn_, rrefId.localId_, forkId.createdOn_,
// forkId.localId_, parent) matches the *_IDX constants defined at the top of
// this file, which fromPyTuple() uses to read the tuple back.
py::tuple RRefForkData::toPyTuple() const {
  py::tuple packed = py::make_tuple(
      ownerId_,
      rrefId_.createdOn_,
      rrefId_.localId_,
      forkId_.createdOn_,
      forkId_.localId_,
      parent_);
  return packed;
}
|
|
|
|
// Rebuilds an RRefForkData from a tuple laid out as described by the *_IDX
// constants at the top of this file. Asserts that the tuple has exactly
// RFD_TUPLE_SIZE entries before reading any of them.
RRefForkData RRefForkData::fromPyTuple(const py::tuple& t) {
  TORCH_INTERNAL_ASSERT(
      t.size() == RFD_TUPLE_SIZE,
      "Pickled RRefForkData must contain 6 numbers.");

  const worker_id_t owner = t[OWNER_IDX].cast<worker_id_t>();

  // Each id is assembled from its (createdOn, localId) pair.
  const RRefId rrefId(
      t[RREFID_ON_IDX].cast<worker_id_t>(),
      t[RREFID_ID_IDX].cast<local_id_t>());
  const RRefId forkId(
      t[FORKID_ON_IDX].cast<worker_id_t>(),
      t[FORKID_ID_IDX].cast<local_id_t>());

  const worker_id_t parentId = t[PARENT_IDX].cast<worker_id_t>();

  return RRefForkData(owner, rrefId, forkId, parentId);
}
|
|
|
|
// Rebuilds an RRefForkData from an IValue holding a 4-element tuple:
//   [0] ownerId (int), [1] RRefId, [2] ForkId, [3] parent (int).
// Asserts on a wrong element count and on worker ids that do not fit into
// worker_id_t.
RRefForkData RRefForkData::fromIValue(const at::IValue& ivalue) {
  auto ivalues = ivalue.toTuple()->elements();

  TORCH_INTERNAL_ASSERT(
      ivalues.size() == 4,
      "Constructing RRefForkData from ivalue "
      "expects a tuple of 4 elements, but got ",
      ivalues.size());

  // toInt() yields an int64_t; make sure narrowing to worker_id_t cannot wrap
  // in either direction. The upper bound stays strict, as in the original
  // check.
  int64_t ownerId = ivalues[0].toInt();
  TORCH_INTERNAL_ASSERT(
      ownerId >= std::numeric_limits<worker_id_t>::min() &&
          ownerId < std::numeric_limits<worker_id_t>::max(),
      "RRefForkData ownerId out of range, got ",
      ownerId);

  RRefId rrefId = RRefId::fromIValue(ivalues[1]);
  ForkId forkId = ForkId::fromIValue(ivalues[2]);

  int64_t parent = ivalues[3].toInt();
  TORCH_INTERNAL_ASSERT(
      parent >= std::numeric_limits<worker_id_t>::min() &&
          parent < std::numeric_limits<worker_id_t>::max(),
      "RRefForkData parent out of range, got ",
      parent);

  // Both values were range-checked above, so the narrowing is explicit and
  // safe.
  return RRefForkData(
      static_cast<worker_id_t>(ownerId),
      rrefId,
      forkId,
      static_cast<worker_id_t>(parent));
}
|
|
|
|
////////////////////////////// RRef /////////////////////////////////////
|
|
|
|
// Records the owner worker id and the RRef id; no other work is done.
RRef::RRef(worker_id_t ownerId, const RRefId& rrefId)
    : ownerId_(ownerId),
      rrefId_(rrefId) {}
|
|
|
|
// Produces the RRefForkData describing a new fork of this RRef: same owner
// and RRef id, a freshly generated globally unique id as the fork id, and
// the current worker's id as the parent.
RRefForkData RRef::fork() const {
  auto& context = RRefContext::getInstance();
  const auto newForkId = context.genGloballyUniqueId();
  const auto currentWorker = context.getWorkerId();
  return RRefForkData(ownerId_, rrefId_, newForkId, currentWorker);
}
|
|
|
|
////////////////////////// UserRRef /////////////////////////////////////
|
|
|
|
template <typename T>
|
|
UserRRef<T>::UserRRef(
|
|
worker_id_t ownerId,
|
|
const RRefId& rrefId,
|
|
const ForkId& forkId)
|
|
: RRef(ownerId, rrefId), forkId_(forkId) {
|
|
// Do nothing,
|
|
// (1) If this UserRRef is a fork of an existing RRef, RRefContext will send
|
|
// a RREF_FORK_REQUEST message to the owner.
|
|
// (2) If this the creator UserRRef, ScriptRemoteCall or PythonRemoteCall will
|
|
// properly notify the owner.
|
|
}
|
|
|
|
template <typename T>
|
|
UserRRef<T>::~UserRRef() {
|
|
// TODO: queue this in RRefContext instead of doing it here.
|
|
auto& ctx = RRefContext::getInstance();
|
|
if (ctx.getWorkerId() != ownerId_) {
|
|
auto fm = ctx.agent()->send(
|
|
ctx.agent()->getWorkerInfo(ownerId_),
|
|
RRefUserDelete(rrefId_, forkId_).toMessage());
|
|
|
|
fm->addCallback(
|
|
[](const Message& message) { RRefContext::handleException(message); });
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
const ForkId& UserRRef<T>::forkId() const {
|
|
return forkId_;
|
|
}
|
|
|
|
template <>
|
|
IValue UserRRef<IValue>::toHere() {
|
|
auto agent = RpcAgent::getDefaultRpcAgent();
|
|
std::shared_ptr<FutureMessage> fm = agent->send(
|
|
agent->getWorkerInfo(ownerId_),
|
|
ScriptRRefFetchCall(rrefId()).toMessage());
|
|
const Message& message = fm->wait();
|
|
RRefContext::handleException(message);
|
|
auto rfr = RRefFetchRet::fromMessage(message);
|
|
TORCH_INTERNAL_ASSERT(
|
|
rfr->values().size() == 1,
|
|
"RRef of IValue should contain a single IValue, but got ",
|
|
rfr->values().size());
|
|
return rfr->values().front();
|
|
}
|
|
|
|
template <>
|
|
py::object UserRRef<py::object>::toHere() {
|
|
auto agent = RpcAgent::getDefaultRpcAgent();
|
|
std::shared_ptr<FutureMessage> fm = agent->send(
|
|
agent->getWorkerInfo(ownerId_),
|
|
PythonRRefFetchCall(rrefId()).toMessage());
|
|
const Message& message = fm->wait();
|
|
RRefContext::handleException(message);
|
|
auto rfr = RRefFetchRet::fromMessage(message);
|
|
return PythonRpcHandler::getInstance().deserialize(
|
|
SerializedPyObj::fromIValues(rfr->values()));
|
|
}
|
|
|
|
template class UserRRef<IValue>;
|
|
template class UserRRef<py::object>;
|
|
|
|
////////////////////////// OwnerRRef /////////////////////////////////////
|
|
|
|
template <typename T>
|
|
const T& OwnerRRef<T>::getValue() const {
|
|
// TODO: use callback to make this non-blocking
|
|
std::unique_lock<std::mutex> lock(mutex_);
|
|
valueCV_.wait(lock, [this] { return value_.has_value(); });
|
|
return value_.value();
|
|
}
|
|
|
|
template <typename T>
|
|
void OwnerRRef<T>::setValue(T&& value) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(mutex_);
|
|
value_ = std::move(value);
|
|
}
|
|
valueCV_.notify_all();
|
|
}
|
|
|
|
template class OwnerRRef<IValue>;
|
|
template class OwnerRRef<py::object>;
|
|
|
|
} // namespace rpc
|
|
} // namespace distributed
|
|
} // namespace torch
|