#pragma once

#include <torch/csrc/distributed/c10d/Backend.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <c10/macros/Macros.h>

// *************************************************************************
// PROCESS GROUP collective communication API IS BEING CHANGED BETWEEN
// versions 1.7 and 1.8.
// PLEASE DO NOT ADD ANY DEPENDENCIES.
// SEE RFC: https://github.com/pytorch/pytorch/issues/39662
// *************************************************************************

constexpr auto kProcessGroupDefaultTimeout =
    std::chrono::milliseconds(30 * 60 * 1000);

namespace c10d {

// We only call `register_work()` in two cases:
// 1. If the work object is created from a functional collective call.
// 2. If the work object is created from a non-functional collective call within
//    the `with allow_inflight_collective_as_graph_input_ctx()` context manager.
C10_EXPORT void register_work(
    const at::Tensor& tensor,
    const c10::intrusive_ptr<c10d::Work>& work);

C10_EXPORT at::Tensor wait_tensor(const at::Tensor& tensor);

// We only call `unregister_work()` in one case:
// 1. If the work object is created from a non-functional collective call within
//    the `with allow_inflight_collective_as_graph_input_ctx()` context manager.
//
// Q: What about the functional collective case?
// A: The unregistration of the work object for a functional collective is done
//    in the required user-side explicit call to `wait_tensor()`.
C10_EXPORT void unregister_work(const c10::intrusive_ptr<c10d::Work>& work);

C10_EXPORT size_t get_work_registry_size();

C10_EXPORT void set_allow_inflight_collective_as_graph_input(bool value);

C10_EXPORT bool allow_inflight_collective_as_graph_input();
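
// A minimal sketch of the registration flow described above, assuming an
// already-initialized process group `pg` and a vector of tensors `tensors`
// (both hypothetical names). With the flag enabled, the collective wrappers
// below register the returned Work for each output tensor, and a later
// `wait_tensor()` call blocks on that Work and unregisters it:
//
//   c10d::set_allow_inflight_collective_as_graph_input(true);
//   auto work = pg->allreduce(tensors);  // Work is registered per output tensor
//   // ... the tensors may now be captured, e.g. as graph inputs ...
//   c10d::wait_tensor(tensors[0]);       // waits for the in-flight collective
//                                        // and unregisters its Work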

// ProcessGroup is a base class that captures collective and point to
// point communication in a fixed set of processes.
//
// The functions specified in the class below describe the API alone;
// implementations are provided in subclasses.
//
// Every function that performs I/O is executed asynchronously by a
// thread pool owned by the ProcessGroup (by default). They return an
// object that can be used to wait for completion or error.
//
// The ProcessGroup can instantiate subgroups with fewer or an equal
// number of members. Implementations must take care that multiple
// process groups can be used in parallel and synchronize accordingly.
//
// The ProcessGroup assumes a fixed set of processes. If the set
// changes, existing instances must be destructed and instantiation
// and initialization must start from scratch. For members of the
// process group to find each other (referred to as rendezvous from
// hereon), a shared Store is used (see the constructor below).
//
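
// A minimal usage sketch, assuming a concrete ProcessGroup instance `pg`
// (hypothetical name). Each collective is launched asynchronously and returns
// a Work handle that the caller uses to wait for completion or error:
//
//   std::vector<at::Tensor> tensors = {at::ones({4})};
//   auto work = pg->allreduce(tensors);  // asynchronous launch
//   work->wait();                        // block until the collective completes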

class TORCH_API ProcessGroup : public torch::CustomClassHolder {
 public:
  enum BackendType : uint8_t {
    UNDEFINED = 0,
    GLOO = 1,
    NCCL = 2,
    UCC = 3,
    MPI = 4,
    CUSTOM = 5,
  };

  static std::string backendTypeToString(const BackendType& type) {
    switch (type) {
      case BackendType::GLOO:
        return "gloo";
      case BackendType::NCCL:
        return "nccl";
      case BackendType::UCC:
        return "ucc";
      case BackendType::MPI:
        return "mpi";
      case BackendType::UNDEFINED:
        return "undefined";
      case BackendType::CUSTOM:
        return "custom";
      default:
        TORCH_CHECK(false, "This should never happen!");
    }
  }

  static BackendType strToBackendType(const std::string& backend) {
    if (backend == "undefined") {
      return BackendType::UNDEFINED;
    } else if (backend == "gloo") {
      return BackendType::GLOO;
    } else if (backend == "nccl") {
      return BackendType::NCCL;
    } else if (backend == "ucc") {
      return BackendType::UCC;
    } else if (backend == "mpi") {
      return BackendType::MPI;
    } else {
      return BackendType::CUSTOM;
    }
  }

  // Not used, set for backwards compatibility and only used for TypeDef in
  // Ops.cpp
  explicit ProcessGroup(int rank, int size);

  explicit ProcessGroup(
      c10::intrusive_ptr<::c10d::Store> store,
      int rank,
      int size);
  ~ProcessGroup() override;

  int getRank() const {
    return rank_;
  }

  int getSize() const {
    return size_;
  }

  // Returns a unique opaque ID of this process group object.
  int64_t getID() const {
    return reinterpret_cast<std::intptr_t>(this);
  }

  // Returns a unique opaque ID of a backend for the specific backend type
  // that can correlate with this process group's collectives.
  int64_t getBackendID(BackendType backend_type) const {
    return reinterpret_cast<std::intptr_t>(getBackend(backend_type).get());
  }

  virtual const std::string getBackendName() const {
    return backendTypeToString(backendType_);
  }

  BackendType getBackendType() const {
    return backendType_;
  }

  virtual void startCoalescing(c10::DeviceType deviceType) {
    // only nccl has implemented startCoalescing so only execute for nccl
    // backends
    auto backend = getBackend(deviceType);
    backend->startCoalescing();
  }

  virtual c10::intrusive_ptr<Work> endCoalescing(c10::DeviceType deviceType) {
    // only nccl has implemented endCoalescing so only execute for nccl
    // backends
    auto backend = getBackend(deviceType);
    auto work = backend->endCoalescing();
    return work;
  }
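
  // A minimal coalescing sketch, assuming a NCCL-backed group `pg` and two
  // tensor buckets (hypothetical names). Collectives issued between
  // startCoalescing() and endCoalescing() are batched, and the single Work
  // returned by endCoalescing() covers all of them:
  //
  //   pg->startCoalescing(c10::DeviceType::CUDA);
  //   pg->allreduce(bucketA);
  //   pg->allreduce(bucketB);
  //   auto work = pg->endCoalescing(c10::DeviceType::CUDA);
  //   work->wait();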

  virtual c10::intrusive_ptr<Work> broadcast(
      std::vector<at::Tensor>& tensors,
      const BroadcastOptions& opts = BroadcastOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::broadcast_", "")
            .typed<
                std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
                    at::TensorList,
                    const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                    int64_t,
                    int64_t,
                    bool,
                    int64_t)>();
    // It's awkward to unbox the opts here and box them again in the custom C++
    // op. But it's also complicated to make opts as a CustomClassHolder. Leave
    // it as it is now.
    auto work = std::get<1>(op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.rootRank,
        opts.rootTensor,
        opts.asyncOp,
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }
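
  // A minimal broadcast sketch, assuming `pg` and `tensors` as above
  // (hypothetical names); rank 0 is the source and every other rank receives
  // its data in place:
  //
  //   BroadcastOptions bopts;
  //   bopts.rootRank = 0;
  //   pg->broadcast(tensors, bopts)->wait();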

  virtual c10::intrusive_ptr<Work> allreduce(
      std::vector<at::Tensor>& tensors,
      const AllreduceOptions& opts = AllreduceOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::allreduce_", "")
            .typed<
                std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
                    at::TensorList,
                    const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                    const c10::intrusive_ptr<::c10d::ReduceOp>&,
                    const std::optional<at::Tensor>& sparse_indices,
                    int64_t)>();

    auto work = std::get<1>(op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<ReduceOp>(opts.reduceOp),
        opts.sparseIndices,
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> allreduce_coalesced(
      std::vector<at::Tensor>& tensors,
      const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::allreduce_coalesced_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             const c10::intrusive_ptr<::c10d::ReduceOp>&,
                             int64_t)>();

    auto work = op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<ReduceOp>(opts.reduceOp),
        opts.timeout.count());

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> reduce(
      std::vector<at::Tensor>& tensors,
      const ReduceOptions& opts = ReduceOptions()) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::reduce_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             const c10::intrusive_ptr<::c10d::ReduceOp>&,
                             int64_t,
                             int64_t,
                             int64_t)>();
    auto work = op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<ReduceOp>(opts.reduceOp),
        opts.rootRank,
        opts.rootTensor,
        opts.timeout.count());

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> allgather(
      std::vector<std::vector<at::Tensor>>& outputTensors,
      std::vector<at::Tensor>& inputTensors,
      const AllgatherOptions& opts = AllgatherOptions()) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::allgather_", "")
                         .typed<std::tuple<
                             std::vector<std::vector<at::Tensor>>,
                             c10::intrusive_ptr<Work>>(
                             const std::vector<std::vector<at::Tensor>>&,
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             int64_t)>();

    auto work = std::get<1>(op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor_list : outputTensors) {
        for (const auto& tensor : tensor_list) {
          c10d::register_work(tensor, work);
        }
      }
    }
    return work;
  }

  // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
  // is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
  // For implementers of ProcessGroup API and advanced users only.
  // Note: this function will be deprecated in the near future.
  virtual c10::intrusive_ptr<Work> _allgather_base(
      at::Tensor& outputBuffer,
      at::Tensor& inputBuffer,
      const AllgatherOptions& opts = AllgatherOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::_allgather_base_", "")
            .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                at::Tensor&,
                at::Tensor&,
                const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                bool,
                int64_t)>();

    auto work = std::get<1>(op.call(
        outputBuffer,
        inputBuffer,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.asyncOp,
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      c10d::register_work(outputBuffer, work);
    }
    return work;
  }
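
  // A minimal sizing sketch for _allgather_base, assuming a hypothetical
  // per-rank input of N elements: the flat output buffer must hold
  // world_size * N elements, laid out rank by rank:
  //
  //   int64_t N = input.numel();
  //   at::Tensor output = at::empty({pg->getSize() * N}, input.options());
  //   pg->_allgather_base(output, input)->wait();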

  // This function is deprecated and will be moved out of ProcessGroup to comms:
  // * do not add dependencies on this function,
  // * do not implement it in your ProcessGroup, implement _allgather_base
  //   instead.
  virtual c10::intrusive_ptr<Work> allgather_coalesced(
      std::vector<std::vector<at::Tensor>>& outputTensorLists,
      std::vector<at::Tensor>& inputTensors,
      const AllgatherOptions& opts = AllgatherOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::allgather_coalesced_", "")
            .typed<c10::intrusive_ptr<Work>(
                const std::vector<std::vector<at::Tensor>>&,
                const at::TensorList&,
                const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();

    auto work = op.call(
        outputTensorLists,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor_list : outputTensorLists) {
        for (const auto& tensor : tensor_list) {
          c10d::register_work(tensor, work);
        }
      }
    }
    return work;
  }

  // This function is a coalesced version of `allgather_into_tensor` (currently
  // still named as `_allgather_base`). Each tensor in the vector corresponds to
  // an input/output of one `allgather_into_tensor` operation.
  virtual c10::intrusive_ptr<Work> allgather_into_tensor_coalesced(
      std::vector<at::Tensor>& outputTensors,
      std::vector<at::Tensor>& inputTensors,
      const AllgatherOptions& opts = AllgatherOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::allgather_into_tensor_coalesced_", "")
            .typed<c10::intrusive_ptr<Work>(
                const at::TensorList,
                const at::TensorList,
                const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();

    auto work = op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : outputTensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> gather(
      std::vector<std::vector<at::Tensor>>& outputTensors,
      std::vector<at::Tensor>& inputTensors,
      const GatherOptions& opts = GatherOptions()) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::gather_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             const std::vector<std::vector<at::Tensor>>&,
                             const at::TensorList&,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             int64_t,
                             int64_t)>();
    auto work = op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.rootRank,
        opts.timeout.count());

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor_list : outputTensors) {
        for (const auto& tensor : tensor_list) {
          c10d::register_work(tensor, work);
        }
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> scatter(
      std::vector<at::Tensor>& outputTensors,
      std::vector<std::vector<at::Tensor>>& inputTensors,
      const ScatterOptions& opts = ScatterOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::scatter_", "")
            .typed<
                std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
                    const at::TensorList&,
                    const std::vector<std::vector<at::Tensor>>&,
                    const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                    int64_t,
                    bool,
                    int64_t)>();
    auto work = std::get<1>(op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.rootRank,
        opts.asyncOp,
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : outputTensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> reduce_scatter(
      std::vector<at::Tensor>& outputTensors,
      std::vector<std::vector<at::Tensor>>& inputTensors,
      const ReduceScatterOptions& opts = ReduceScatterOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::reduce_scatter_", "")
            .typed<
                std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
                    const at::TensorList&,
                    const std::vector<std::vector<at::Tensor>>&,
                    const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                    const c10::intrusive_ptr<::c10d::ReduceOp>&,
                    int64_t)>();
    auto work = std::get<1>(op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : outputTensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> _reduce_scatter_base(
      at::Tensor& outputBuffer,
      at::Tensor& inputBuffer,
      const ReduceScatterOptions& opts = ReduceScatterOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::_reduce_scatter_base_", "")
            .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                at::Tensor&,
                at::Tensor&,
                const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                const c10::intrusive_ptr<::c10d::ReduceOp>&,
                bool,
                int64_t)>();
    auto work = std::get<1>(op.call(
        outputBuffer,
        inputBuffer,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
        opts.asyncOp,
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      c10d::register_work(outputBuffer, work);
    }
    return work;
  }
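
  // A minimal sizing sketch for _reduce_scatter_base (the inverse of
  // _allgather_base), assuming a hypothetical flat input of world_size * N
  // elements; each rank receives the N-element reduced shard that corresponds
  // to its rank:
  //
  //   int64_t N = input.numel() / pg->getSize();
  //   at::Tensor output = at::empty({N}, input.options());
  //   pg->_reduce_scatter_base(output, input)->wait();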

  // This function is a coalesced version of `reduce_scatter_tensor` (currently
  // still named as `_reduce_scatter_base`). Each tensor in the vector
  // corresponds to an input/output of one `reduce_scatter_tensor` operation.
  virtual c10::intrusive_ptr<Work> reduce_scatter_tensor_coalesced(
      std::vector<at::Tensor>& outputTensors,
      std::vector<at::Tensor>& inputTensors,
      const ReduceScatterOptions& opts = ReduceScatterOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::reduce_scatter_tensor_coalesced_", "")
            .typed<c10::intrusive_ptr<Work>(
                const at::TensorList,
                const at::TensorList,
                const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                const c10::intrusive_ptr<::c10d::ReduceOp>&,
                int64_t)>();

    auto work = op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
        opts.timeout.count());

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : outputTensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> alltoall_base(
      at::Tensor& outputBuffer,
      at::Tensor& inputBuffer,
      std::vector<int64_t>& outputSplitSizes,
      std::vector<int64_t>& inputSplitSizes,
      const AllToAllOptions& opts = AllToAllOptions()) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::alltoall_base_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::Tensor&,
                             at::Tensor&,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             std::vector<int64_t>,
                             std::vector<int64_t>,
                             int64_t)>();
    auto work = op.call(
        outputBuffer,
        inputBuffer,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        outputSplitSizes,
        inputSplitSizes,
        opts.timeout.count());

    if (c10d::allow_inflight_collective_as_graph_input()) {
      c10d::register_work(outputBuffer, work);
    }
    return work;
  }
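
  // A minimal alltoall_base sketch, assuming hypothetical flat buffers that
  // divide evenly across ranks. Empty split-size vectors conventionally
  // request an even split; per-rank element counts can be passed instead for
  // uneven splits:
  //
  //   std::vector<int64_t> inSplits;   // empty => even split of inputBuffer
  //   std::vector<int64_t> outSplits;  // empty => even split of outputBuffer
  //   pg->alltoall_base(outputBuffer, inputBuffer, outSplits, inSplits)->wait();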

  virtual c10::intrusive_ptr<Work> alltoall(
      std::vector<at::Tensor>& outputTensors,
      std::vector<at::Tensor>& inputTensors,
      const AllToAllOptions& opts = AllToAllOptions()) {
    static auto op =
        c10::Dispatcher::singleton()
            .findSchemaOrThrow("c10d::alltoall_", "")
            .typed<
                std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
                    const at::TensorList&,
                    const at::TensorList&,
                    const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                    int64_t)>();
    auto work = std::get<1>(op.call(
        outputTensors,
        inputTensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.timeout.count()));

    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : outputTensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual void monitoredBarrier(
      const BarrierOptions& opts,
      bool wait_all_ranks = false) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::monitored_barrier_", "")
                         .typed<void(
                             at::Tensor,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             const std::vector<int64_t>&,
                             int64_t,
                             bool)>();
    // Default to the CPU implementation; monitored barrier is only supported
    // by the GLOO backend.
    at::Tensor tensor = at::empty({0}, at::TensorOptions().device(at::kCPU));
    op.call(
        tensor,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.device_ids,
        opts.timeout.count(),
        wait_all_ranks);
  }

  // Agrees on an initial sequence number for the whole group by having rank 0
  // create it and broadcast it to other ranks using the store. Only implemented
  // for GLOO and NCCL backends currently.
  virtual void setSequenceNumberForGroup() {
    auto backendType = getBackendType();
    // TODO: HACK for backend name to get sequence number for that backend.
    if (backendType == ProcessGroup::BackendType::GLOO ||
        backendType == ProcessGroup::BackendType::NCCL ||
        backendType == ProcessGroup::BackendType::UCC) {
      getDefaultBackend()->setSequenceNumberForGroup();
    } else {
      TORCH_CHECK(
          false,
          c10::str(
              "ProcessGroup ",
              getBackendName(),
              " does not yet support sequence numbers."));
    }
  }

  // Retrieves the current sequence number for the whole group, which should be
  // in sync. If the returned number is not consistent across the group, it
  // may indicate that there is some sort of collective desynchronization.
  virtual uint64_t getSequenceNumberForGroup() {
    auto backendType = getBackendType();

    // TODO: HACK for backend name to get sequence number for that backend.
    if (backendType == ProcessGroup::BackendType::GLOO ||
        backendType == ProcessGroup::BackendType::NCCL ||
        backendType == ProcessGroup::BackendType::UCC) {
      return getDefaultBackend()->getSequenceNumberForGroup();
    } else {
      TORCH_CHECK(
          false,
          c10::str(
              "ProcessGroup ",
              getBackendName(),
              " does not yet support sequence numbers."));
    }
  }

  virtual c10::intrusive_ptr<Work> send(
      std::vector<at::Tensor>& tensors,
      int dstRank,
      int tag) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::send", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             int64_t,
                             int64_t)>();
    auto work = op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        dstRank,
        tag);
    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> recv(
      std::vector<at::Tensor>& tensors,
      int srcRank,
      int tag) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::recv_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             int64_t,
                             int64_t)>();
    auto work = op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        srcRank,
        tag);
    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }
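
  // A minimal point-to-point sketch, assuming hypothetical ranks 0 and 1 and
  // a shared tag; the receiver's tag must match the sender's:
  //
  //   if (pg->getRank() == 0) {
  //     pg->send(tensors, /*dstRank=*/1, /*tag=*/0)->wait();
  //   } else if (pg->getRank() == 1) {
  //     pg->recv(tensors, /*srcRank=*/0, /*tag=*/0)->wait();
  //   }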

  virtual c10::intrusive_ptr<Work> recvAnysource(
      std::vector<at::Tensor>& tensors,
      int tag) {
    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::recv_any_source_", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::TensorList,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             int64_t)>();
    auto work = op.call(
        tensors,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        tag);
    if (c10d::allow_inflight_collective_as_graph_input()) {
      for (const auto& tensor : tensors) {
        c10d::register_work(tensor, work);
      }
    }
    return work;
  }

  virtual c10::intrusive_ptr<Work> barrier(
      const BarrierOptions& opts = BarrierOptions()) {
    static at::Tensor tensor;
    // TODO: if nccl was specified then use it
    auto device = opts.device;
    if (device.has_value()) {
      // set device tensor from argument
      tensor = at::empty(
          {1}, at::TensorOptions().device(device.value()).dtype(at::kByte));
    } else if (backendType_ == c10d::ProcessGroup::BackendType::NCCL) {
      // set cuda tensor
      tensor = at::empty(
          {1},
          at::TensorOptions().device(at::DeviceType::CUDA).dtype(at::kByte));
    } else {
      // Default to using cpu implementation
      tensor = at::empty(
          {1},
          at::TensorOptions().device(at::DeviceType::CPU).dtype(at::kByte));
    }

    static auto op = c10::Dispatcher::singleton()
                         .findSchemaOrThrow("c10d::barrier", "")
                         .typed<c10::intrusive_ptr<::c10d::Work>(
                             at::Tensor,
                             const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                             const std::vector<int64_t>&,
                             int64_t)>();

    auto work = op.call(
        tensor,
        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
        opts.device_ids,
        opts.timeout.count());
    if (c10d::allow_inflight_collective_as_graph_input()) {
      c10d::register_work(tensor, work);
    }
    return work;
  }
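
  // A minimal barrier sketch with default options; the device used for the
  // dummy tensor is picked as in the implementation above:
  //
  //   pg->barrier()->wait();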

  bool hasBackends() {
    return !deviceTypeToBackendType_.empty();
  }

  void setBackend(
      c10::DeviceType deviceType,
      BackendType backendType,
      const std::optional<c10::intrusive_ptr<Backend>>& backend) {
    // TODO: should we add these entries after the backend setting succeeds?
    deviceTypeToBackendType_[deviceType] = backendType;
    deviceTypes_.insert(deviceType);
    // if the backendType is already set then reuse it for this device
    if (backendTypeToBackend_.find(backendType) !=
        backendTypeToBackend_.end()) {
      auto existingBackend = backendTypeToBackend_.at(backendType);
      deviceTypeToBackend_[deviceType] = existingBackend;
      TORCH_CHECK(
          existingBackend->getBoundDeviceId() ==
          (*backend)->getBoundDeviceId());
    } else {
      // check if backend has value
      if (backend.has_value()) {
        deviceTypeToBackend_[deviceType] = backend.value();
        backendTypeToBackend_[backendType] = backend.value();
        (*backend)->setBoundDeviceId(bound_device_id_);
      }
    }
  }

  c10::intrusive_ptr<Backend> getDefaultBackend() const {
    TORCH_CHECK(
        backendTypeToBackend_.find(backendType_) != backendTypeToBackend_.end(),
        "Could not find the default backend type ",
        backendType_,
        " for Process Group with name ",
        getBackendName(),
        ".");
    return backendTypeToBackend_.at(backendType_);
  }

  void setDefaultBackend(const BackendType& backendType) {
    backendType_ = backendType;
  }

  void setDefaultBackend(const std::string& backend) {
    backendType_ = strToBackendType(backend);
  }

  c10::intrusive_ptr<Backend> getBackend(c10::DeviceType deviceType);

  c10::intrusive_ptr<Backend> getBackend(BackendType backendType) const {
    TORCH_CHECK(
        backendTypeToBackend_.find(backendType) != backendTypeToBackend_.end(),
        "Could not find backend type ",
        backendType,
        ".");
    return backendTypeToBackend_.at(backendType);
  }

  // Return device types supported by this ProcessGroup.
  // Note: the return type is `Device` rather than `DeviceType` for the purpose
  // of easy comparison at Python level. The `Device` will have default index
  // (-1).
  std::vector<c10::Device> getDeviceTypes() const {
    std::vector<c10::Device> devices;
    devices.reserve(deviceTypes_.size());
    for (auto& dt : deviceTypes_) {
      devices.emplace_back(dt);
    }
    return devices;
  }

  void registerOnCompletionHook(
      std::function<void(std::shared_ptr<WorkInfo>)>&& hook) {
    getDefaultBackend()->registerOnCompletionHook(std::move(hook));
  }

  void waitForPendingWorks() {
    getDefaultBackend()->waitForPendingWorks();
  }

  bool hasHooks() const {
    return getDefaultBackend()->hasHooks();
  }

  const std::string& getGroupName() const;
  void setGroupName(const std::string& name);
  const std::string& getGroupDesc() const;
  void setGroupDesc(const std::string& name);
  void enableCollectivesTiming();

  void release_resources() override;

  // ProcessGroups optionally can be "bound" to a specific device.
  // Currently this is only for nccl and allows for some opt-in
  // optimizations such as automatic use of ncclCommSplit. The device
  // is specified in `init_process_group` and eventually makes it
  // here and then down into the actual backend instances.
  std::optional<at::Device> getBoundDeviceId() const {
    return bound_device_id_;
  }

  void setBoundDeviceId(std::optional<at::Device> device) {
    if (device) {
      TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index");
    }
    bound_device_id_ = device;
  }

 protected:
  // Implementations of this interface need to call this to setup
  // appropriate logging etc.
  void init();

  c10::intrusive_ptr<c10d::Store> store_;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int rank_;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int size_;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  BackendType backendType_;
  std::string pg_desc_;

  // Debug level setting. It is parsed once when ProcessGroup is constructed and
  // remains the same across use of this process group.
  DebugLevel dist_debug_level_{DebugLevel::Off};

  // Backend classes for this ProcessGroup
  std::unordered_set<c10::DeviceType> deviceTypes_;
  std::unordered_map<c10::DeviceType, BackendType> deviceTypeToBackendType_;
  std::unordered_map<c10::DeviceType, c10::intrusive_ptr<Backend>>
      deviceTypeToBackend_;
  std::unordered_map<BackendType, c10::intrusive_ptr<Backend>>
      backendTypeToBackend_;

  std::optional<at::Device> bound_device_id_;
};

} // namespace c10d