#pragma once

#include <chrono>
#include <memory>
#include <utility>
#include <vector>

#include <ATen/ATen.h>
#include <c10/core/Allocator.h>
#include <c10/macros/Macros.h>

#include <torch/csrc/distributed/c10d/Types.hpp>
#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>
#include <torch/csrc/distributed/c10d/debug.h>

constexpr auto kBackendDefaultTimeout =
    std::chrono::milliseconds(30 * 60 * 1000);
namespace c10d {

enum class ErrorType {
  SUCCESS = 0,
  TIMEOUT = 1,
  // e.g., NCCL error, etc.
  COMM_ERROR = 2,
  // TODO: do we need to distinguish between remote timeouts and remote COMM
  // errors?
  REMOTE_ERROR = 3
};

class TORCH_API Backend : public torch::CustomClassHolder {
 public:
  // Options is a base struct that defines the basic options used when
  // constructing a Backend. Each Backend subclass should extend this struct
  // and define its own options if it wants to provide more configuration
  // options (beyond the basic ones defined here) to the end user.
  struct TORCH_API Options : torch::CustomClassHolder {
    explicit Options(
        std::string backend,
        std::chrono::milliseconds timeout = kBackendDefaultTimeout)
        : timeout(timeout), backend(std::move(backend)) {}
    ~Options() override = default;

    std::chrono::milliseconds timeout;

    // backend name
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const std::string backend;
    std::string group_name;
    std::vector<uint64_t> global_ranks_in_group;
  };
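
  // Illustrative sketch only (not part of this API): a backend that wants to
  // expose extra knobs would typically extend Options. A hypothetical
  // "FooBackend" might declare:
  //
  //   struct FooBackendOptions : Backend::Options {
  //     explicit FooBackendOptions(
  //         std::chrono::milliseconds timeout = kBackendDefaultTimeout)
  //         : Backend::Options("foo", timeout) {}
  //     // backend-specific knobs (hypothetical)
  //     bool enable_fast_path = false;
  //     int num_comm_streams = 1;
  //   };
  //
  // and accept a c10::intrusive_ptr<FooBackendOptions> in its constructor.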

  explicit Backend(int rank, int size);
  ~Backend() override = 0;

  int getRank() const {
    return rank_;
  }

  int getSize() const {
    return size_;
  }

  // Returns a unique opaque ID of this backend that can be used to correlate
  // with its collectives.
  int64_t getID() const {
    return reinterpret_cast<std::intptr_t>(this);
  }

  virtual bool supportsSplitting() const {
    return false;
  }

  virtual bool supportsCoalescing() const {
    return false;
  }

  virtual bool supportsTimeEstimation() const {
    return false;
  }

  virtual void setTimeout(std::chrono::milliseconds timeout) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support setting timeout"));
  }

  virtual void startCoalescing() {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not implement startCoalescing"));
  }

  virtual c10::intrusive_ptr<Work> endCoalescing() {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not implement endCoalescing"));
  }

  // Subclasses must override this method to return the backend name.
  virtual const std::string getBackendName() const {
    TORCH_INTERNAL_ASSERT(false, "getBackendName is not implemented.");
  }

  // Subclasses must override this method to return the backend's Options.
  virtual c10::intrusive_ptr<Options> getBackendOptions() {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not implement getBackendOptions."));
  }

  virtual c10::intrusive_ptr<Work> broadcast(
      std::vector<at::Tensor>& /* tensors */,
      const BroadcastOptions& /* opts */ = BroadcastOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support broadcast"));
  }

  virtual c10::intrusive_ptr<Work> allreduce(
      std::vector<at::Tensor>& /* tensors */,
      const AllreduceOptions& /* opts */ = AllreduceOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support allreduce"));
  }

  virtual c10::intrusive_ptr<Work> allreduce_sparse(
      std::vector<at::Tensor>& /* tensors */,
      const AllreduceOptions& /* opts */ = AllreduceOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support allreduce sparse"));
  }

  virtual c10::intrusive_ptr<Work> allreduce_coalesced(
      std::vector<at::Tensor>& /* tensors */,
      const AllreduceCoalescedOptions& /* opts */ =
          AllreduceCoalescedOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support allreduce_coalesced"));
  }

  virtual c10::intrusive_ptr<Work> reduce(
      std::vector<at::Tensor>& /* tensors */,
      const ReduceOptions& /* opts */ = ReduceOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support reduce"));
  }

  virtual c10::intrusive_ptr<Work> allgather(
      std::vector<std::vector<at::Tensor>>& /* outputTensors */,
      std::vector<at::Tensor>& /* inputTensors */,
      const AllgatherOptions& /* opts */ = AllgatherOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support allgather"));
  }

  // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
  // is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
  // For implementers of ProcessGroup API and advanced users only.
  // Note: this function will be deprecated in the near future.
  virtual c10::intrusive_ptr<Work> _allgather_base(
      at::Tensor& /* outputBuffer */,
      at::Tensor& /* inputBuffer */,
      const AllgatherOptions& /* opts */ = AllgatherOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support _allgather_base"));
  }
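
  // Illustrative shape example (not normative): with WORLD_SIZE == 4 and an
  // inputBuffer of 8 elements, the outputBuffer is expected to hold 32
  // elements, with rank r's contribution occupying elements [r * 8, (r + 1) * 8).
  // A hypothetical call through a concrete `backend` might look like:
  //
  //   at::Tensor input = at::ones({8}, at::kFloat);
  //   at::Tensor output = at::empty({32}, at::kFloat);
  //   auto work = backend->_allgather_base(output, input);
  //   work->wait();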

  // This function is deprecated and will be moved out of Backend to comms:
  // * do not add dependencies on this function,
  // * do not implement it in your Backend, implement _allgather_base
  //   instead.
  virtual c10::intrusive_ptr<Work> allgather_coalesced(
      std::vector<std::vector<at::Tensor>>& /* outputTensorLists */,
      std::vector<at::Tensor>& /* inputTensors */,
      const AllgatherOptions& /* opts */ = AllgatherOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support allgather_coalesced"));
  }

  // This function is a coalesced version of `allgather_into_tensor` (currently
  // still named as `_allgather_base`). Each tensor in the vector corresponds
  // to an input/output of one `allgather_into_tensor` operation.
  virtual c10::intrusive_ptr<Work> allgather_into_tensor_coalesced(
      std::vector<at::Tensor>& /* outputs */,
      std::vector<at::Tensor>& /* inputs */,
      const AllgatherOptions& /* opts */ = AllgatherOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support allgather_into_tensor_coalesced"));
  }
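
  // Illustrative sketch (assumes a concrete `backend` and GPU-resident
  // tensors): the coalesced variant behaves like issuing one
  // allgather_into_tensor per (output, input) pair, but lets the backend
  // batch the operations:
  //
  //   // Each out_i must have world_size times the elements of in_i.
  //   std::vector<at::Tensor> inputs = {in0, in1};
  //   std::vector<at::Tensor> outputs = {out0, out1};
  //   auto work = backend->allgather_into_tensor_coalesced(outputs, inputs);
  //   work->wait();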

  virtual c10::intrusive_ptr<Work> gather(
      std::vector<std::vector<at::Tensor>>& /* outputTensors */,
      std::vector<at::Tensor>& /* inputTensors */,
      const GatherOptions& /* opts */ = GatherOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support gather"));
  }

  virtual c10::intrusive_ptr<Work> scatter(
      std::vector<at::Tensor>& /* outputTensors */,
      std::vector<std::vector<at::Tensor>>& /* inputTensors */,
      const ScatterOptions& /* opts */ = ScatterOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support scatter"));
  }

  virtual c10::intrusive_ptr<Work> reduce_scatter(
      std::vector<at::Tensor>& /* outputTensors */,
      std::vector<std::vector<at::Tensor>>& /* inputTensors */,
      const ReduceScatterOptions& /* opts */ = ReduceScatterOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support reduce_scatter"));
  }

  virtual c10::intrusive_ptr<Work> _reduce_scatter_base(
      at::Tensor& /* outputBuffer */,
      at::Tensor& /* inputBuffer */,
      const ReduceScatterOptions& /* opts */ = ReduceScatterOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support _reduce_scatter_base"));
  }

  // This function is a coalesced version of `reduce_scatter_tensor` (currently
  // still named as `_reduce_scatter_base`). Each tensor in the vector
  // corresponds to an input/output of one `reduce_scatter_tensor` operation.
  virtual c10::intrusive_ptr<Work> reduce_scatter_tensor_coalesced(
      std::vector<at::Tensor>& /* outputs */,
      std::vector<at::Tensor>& /* inputs */,
      const ReduceScatterOptions& /* opts */ = ReduceScatterOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            getBackendName(),
            " does not support reduce_scatter_tensor_coalesced"));
  }
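
  // Illustrative sketch (assumes a concrete `backend` on a group of W ranks):
  // each output_i receives the reduction of the matching input_i shards, so
  // input_i must have W times the elements of output_i:
  //
  //   std::vector<at::Tensor> inputs = {in0, in1};    // each of W * K elements
  //   std::vector<at::Tensor> outputs = {out0, out1}; // each of K elements
  //   auto work = backend->reduce_scatter_tensor_coalesced(outputs, inputs);
  //   work->wait();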

  virtual c10::intrusive_ptr<Work> alltoall_base(
      at::Tensor& /* outputBuffer */,
      at::Tensor& /* inputBuffer */,
      std::vector<int64_t>& /* outputSplitSizes */,
      std::vector<int64_t>& /* inputSplitSizes */,
      const AllToAllOptions& /* opts */ = AllToAllOptions()) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support alltoall_base"));
  }

  virtual c10::intrusive_ptr<Work> alltoall(
      std::vector<at::Tensor>& /* outputTensors */,
      std::vector<at::Tensor>& /* inputTensors */,
      const AllToAllOptions& /* opts */ = AllToAllOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support alltoall"));
  }

  virtual void monitoredBarrier(
      const BarrierOptions& /* unused */,
      bool /* unused */ = false) {
    auto backendName = getBackendName();
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            backendName,
            " does not support monitoredBarrier, only GLOO supports monitored barrier."));
  }

  // Agrees on an initial sequence number for the whole group by having rank 0
  // create it and broadcast it to other ranks using the store. Only
  // implemented for GLOO and NCCL backends currently.
  virtual void setSequenceNumberForGroup() {
    auto backendName = getBackendName();
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            backendName,
            " does not yet support sequence numbers."));
  }

  // Retrieves the current sequence number for the whole group, which should be
  // in sync. If the returned number is not consistent across the group, it
  // may indicate that there is some sort of collective desynchronization.
  virtual uint64_t getSequenceNumberForGroup() {
    auto backendName = getBackendName();
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ",
            backendName,
            " does not yet support sequence numbers."));
  }
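
  // Illustrative sketch only (not the actual GLOO/NCCL implementation): a
  // backend with access to a c10d::Store could agree on a starting sequence
  // number roughly as follows, with rank 0 publishing a value that all other
  // ranks read back:
  //
  //   if (rank_ == 0) {
  //     uint64_t seq = 0;
  //     std::vector<uint8_t> bytes(
  //         reinterpret_cast<uint8_t*>(&seq),
  //         reinterpret_cast<uint8_t*>(&seq) + sizeof(seq));
  //     store->set("seq_num", bytes);
  //   } else {
  //     std::vector<uint8_t> bytes = store->get("seq_num"); // blocks until set
  //     uint64_t seq;
  //     std::memcpy(&seq, bytes.data(), sizeof(seq));
  //   }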

  virtual c10::intrusive_ptr<Work> send(
      std::vector<at::Tensor>& /* tensors */,
      int /* dstRank */,
      int /* tag */) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support send"));
  }

  virtual c10::intrusive_ptr<Work> recv(
      std::vector<at::Tensor>& /* tensors */,
      int /* srcRank */,
      int /* tag */) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support recv"));
  }

  virtual c10::intrusive_ptr<Work> recvAnysource(
      std::vector<at::Tensor>& /* tensors */,
      int /* tag */) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support recvAnysource"));
  }

  virtual c10::intrusive_ptr<Work> barrier(
      const BarrierOptions& /* opts */ = BarrierOptions()) {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support barrier"));
  }

  virtual void registerOnCompletionHook(
      std::function<void(std::shared_ptr<WorkInfo>)>&& hook) {
    TORCH_CHECK(
        false,
        "Only ProcessGroupNCCL supports onCompletion hook, but got ",
        getBackendName(),
        " backend.");
  }

  virtual void waitForPendingWorks() {
    TORCH_CHECK(
        false,
        "Only ProcessGroupNCCL supports waitForPendingWorks, but got ",
        getBackendName(),
        " backend.");
  }

  virtual void enableCollectivesTiming() {
    TORCH_CHECK(
        false,
        "Backend ",
        getBackendName(),
        " is missing implementation of enableCollectivesTiming.");
  }

  virtual c10::intrusive_ptr<Backend> split(
      const c10::intrusive_ptr<Store>& store,
      const std::vector<int>& ranks,
      const c10::intrusive_ptr<Options>& opts) {
    TORCH_CHECK(
        false,
        "Backend ",
        getBackendName(),
        " is missing implementation of split.");
  }

  virtual c10::intrusive_ptr<Backend> merge(
      const c10::intrusive_ptr<Store>& store,
      const c10::intrusive_ptr<Options>& opts,
      const int& rank,
      const int& size) {
    TORCH_CHECK(
        false,
        "Backend ",
        getBackendName(),
        " is missing implementation of merge.");
  }

  bool hasHooks() const {
    return onCompletionHook_ != nullptr;
  }

  // Do not call this directly; use ProcessGroup::setGroupName instead.
  void setGroupUid(const std::string& pg_uid) {
    pg_uid_ = pg_uid;
  }

  const std::string& getGroupUid() const {
    return pg_uid_;
  }

  void setGroupDesc(const std::string& desc) {
    pg_desc_ = desc;
  }

  const std::string& getGroupDesc() const {
    return pg_desc_;
  }

  // See similar functions in ProcessGroup.hpp for context.
  std::optional<at::Device> getBoundDeviceId() const {
    return bound_device_id_;
  }

  // Perform an eager connect to the specified device if the backend supports
  // it.
  virtual void eagerConnectSingleDevice(at::Device device) {
    // no-op in the default case; this is an optimization some
    // backends may perform
  }

  void setBoundDeviceId(std::optional<at::Device> device) {
    if (device) {
      TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index");
    }
    bound_device_id_ = device;
  }

  virtual ErrorType getError() {
    TORCH_CHECK(
        false,
        c10::str("Backend ", getBackendName(), " does not support getError"));
  }

  virtual std::shared_ptr<c10::Allocator> getMemAllocator() {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support getMemAllocator"));
  }

  // Allocate a tensor (as if by aten::empty) from the backend's
  // communication-optimized memory pool.
  virtual at::Tensor allocateTensor(long size, at::TensorOptions options = {}) {
    TORCH_CHECK(
        false,
        c10::str(
            "Backend ", getBackendName(), " does not support allocateTensor"));
  }
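
  // Illustrative usage sketch (assumes a concrete `backend` and an at::Device
  // `device` with an index): callers would typically check
  // supportsTensorAlloc() before requesting memory from the backend's pool:
  //
  //   at::Tensor t;
  //   if (backend->supportsTensorAlloc(device.index())) {
  //     t = backend->allocateTensor(1024, at::TensorOptions().device(device));
  //   } else {
  //     t = at::empty({1024}, at::TensorOptions().device(device));
  //   }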

  // Returns true if the backend supports tensor allocation.
  virtual bool supportsTensorAlloc(c10::DeviceIndex deviceIdx) {
    // Change to true in a concrete backend if supported
    return false;
  }

  // Aborts all pending operations and connections in the backend, if the
  // backend supports it.
  virtual void abort() {}

  // Shuts down the backend, if the backend supports it. This should be used
  // for normal shutdown.
  virtual void shutdown() {}

 protected:
  // Implementations of this interface need to call this to set up
  // appropriate logging etc.
  void init();

  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int rank_;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  const int size_;
  // Debug level setting. It is parsed once when the ProcessGroup is
  // constructed and remains the same across uses of this process group.
  DebugLevel dist_debug_level_;
  std::string pg_uid_;
  std::string pg_desc_;

  std::function<void(std::shared_ptr<WorkInfo>)> onCompletionHook_;

  std::optional<at::Device> bound_device_id_;
};
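
// Illustrative sketch of a minimal Backend subclass (hypothetical, not part of
// this header): a concrete backend typically overrides getBackendName(), the
// collectives it supports, and calls init() from its constructor so that
// logging and debug settings are set up:
//
//   class FooBackend : public Backend {
//    public:
//     FooBackend(int rank, int size) : Backend(rank, size) {
//       init();
//     }
//
//     const std::string getBackendName() const override {
//       return "foo";
//     }
//
//     c10::intrusive_ptr<Work> allreduce(
//         std::vector<at::Tensor>& tensors,
//         const AllreduceOptions& opts = AllreduceOptions()) override {
//       // ... launch the backend's allreduce and return a Work handle ...
//     }
//   };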

} // namespace c10d