mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
**Summary** The reducer of `DistributedDataParallel` is implemented in C++, which makes it hard to trace the allreduce it launches. This PR modifies `DistributedDataParallel` to launch one allreduce per gradient when `compiled_autograd` is enabled. The changes allow us to use `compiled_autograd` to trace the allreduces so that they can later be optimized (fused) by Inductor. **Key Logic** 1. If `ddp_python_hook` is True, we assume `compiled_autograd` is used. `DistributedDataParallel` registers `compiled_accum_grad_hook` for all parameters. 2. In the first forward() call, if `DistributedDataParallel` is not compiled, all `compiled_accum_grad_hook` are deregistered. If `DistributedDataParallel` is compiled, all `compiled_accum_grad_hook` will be compiled by `compiled_autograd`. 3. `compiled_accum_grad_hook` launches an allreduce to reduce the gradient of the parameter. **Bucketing** The compiled backward is slow because there is no bucketing for the allreduces. We rely on Inductor to bucket the allreduces. The bucketing is done in a separate PR. Differential Revision: [D49428482](https://our.internmc.facebook.com/intern/diff/D49428482/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/110662 Approved by: https://github.com/wconstab
41 lines
1.2 KiB
C++
41 lines
1.2 KiB
C++
#pragma once
|
|
|
|
#include <torch/csrc/autograd/function_hook.h>
|
|
|
|
namespace torch {
|
|
namespace autograd {
|
|
namespace utils {
|
|
|
|
// Turns lambda into a torch::autograd::FunctionPostHook.
|
|
class LambdaPostHook : public torch::autograd::FunctionPostHook {
|
|
using variable_list = std::vector<torch::autograd::Variable>;
|
|
using fn_type =
|
|
std::function<variable_list(const variable_list&, const variable_list&)>;
|
|
using compiled_fn_type = std::function<void(CompiledNodeArgs&)>;
|
|
|
|
public:
|
|
// The lambda function takes as arguments the outputs and inputs of the
|
|
// autograd function and can modify the outputs of the autograd function by
|
|
// returning a new output if needed.
|
|
/* implicit */ LambdaPostHook(fn_type fn) : fn_(std::move(fn)) {}
|
|
|
|
LambdaPostHook(fn_type fn, compiled_fn_type compiled_fn)
|
|
: fn_(std::move(fn)), compiled_fn_(std::move(compiled_fn)) {}
|
|
|
|
variable_list operator()(
|
|
const variable_list& outputs,
|
|
const variable_list& inputs) override {
|
|
return fn_(outputs, inputs);
|
|
}
|
|
|
|
void compiled_args(CompiledNodeArgs& args) override {}
|
|
|
|
protected:
|
|
std::function<variable_list(const variable_list&, const variable_list&)> fn_;
|
|
compiled_fn_type compiled_fn_;
|
|
};
|
|
|
|
} // namespace utils
|
|
} // namespace autograd
|
|
} // namespace torch
|