[Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903
Approved by: https://github.com/ezyang
@@ -4,8 +4,7 @@
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 using HandleType = CUmemGenericAllocationHandle;
@@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target {
       size_t block_size,
       size_t buffer_size,
       size_t signal_pad_offset,
-      const std::string& group_name)
+      std::string group_name)
       : handle(handle),
         device_idx(device_idx),
         block_size(block_size),
         buffer_size(buffer_size),
         signal_pad_offset(signal_pad_offset),
-        group_name(group_name),
+        group_name(std::move(group_name)),
         symm_mem(nullptr) {}
 };
 
@@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator {
   std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
 };
 
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
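Both symmetric-memory hunks collapse the two-level namespace into the C++17 nested-namespace form suggested by clang-tidy's modernize-concat-nested-namespaces check. A minimal sketch of the two equivalent spellings (the struct names here are hypothetical):

// Pre-C++17 spelling:
namespace c10d {
namespace symmetric_memory {
struct ExampleA {};
} // namespace symmetric_memory
} // namespace c10d

// C++17 spelling, as used by the patch; declares into the same namespace:
namespace c10d::symmetric_memory {
struct ExampleB {};
} // namespace c10d::symmetric_memory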
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <optional>
-
 #include <ATen/ATen.h>
 
 namespace c10d {
@@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target {
 
 struct DMAConnectivityDetector : c10::intrusive_ptr_target {
   virtual c10::intrusive_ptr<DMAConnectivity> detect() = 0;
-  virtual ~DMAConnectivityDetector() {}
+  ~DMAConnectivityDetector() override = default;
 };
 
 C10_EXPORT void register_dma_connectivity_detector(
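This destructor rewrite recurs throughout the patch: once the base class declares a virtual destructor, a derived class can replace `virtual ~Derived() {}` with `~Derived() override = default`, which says the same thing more precisely and lets the compiler generate a trivial destructor. A sketch with hypothetical types:

struct Base {
  virtual ~Base() = default;  // one virtual destructor at the root
};

struct Derived : Base {
  ~Derived() override = default;  // same behavior as `virtual ~Derived() {}`
};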
@@ -11,6 +11,7 @@ class GroupRegistry {
  public:
   void register_group(
       std::string group_name,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       c10::intrusive_ptr<c10d::ProcessGroup> group) {
     std::unique_lock write_lock(lock_);
     auto [_, inserted] =
@@ -370,7 +370,7 @@ class NCCLComm {
   NCCLComm& operator=(NCCLComm&& other) = delete;
 
   // Move constructable
-  // NOLINTNEXTLINE(.*-noexcept-move-.*)
+  // NOLINTNEXTLINE(*-noexcept-move-*)
   NCCLComm(NCCLComm&& other) {
     // Using other's lock, as it reads other's states
     // Can not use this.mutex_, as this object is being constructed.
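The fix here is to the suppression comment itself: clang-tidy matches the check names inside a NOLINT comment as globs with `*` wildcards, not as regular expressions, so the old `.*-noexcept-move-.*` spelling never matched any check. A sketch of the working form (the type is hypothetical):

struct Movable {
  // NOLINTNEXTLINE(*-noexcept-move-*)
  Movable(Movable&&) {}  // move constructor deliberately not noexcept
};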
@@ -6,6 +6,7 @@
 #include <deque>
 #include <mutex>
 #include <thread>
+#include <utility>
 #include <vector>
 
 #include <gloo/algorithm.h>
@@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend {
   // Wrap c10d store as Gloo store
   class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
-    GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
+    GlooStore(c10::intrusive_ptr<::c10d::Store> store)
+        : store_(std::move(store)) {}
 
     void setUint(const std::string& key, const std::vector<uint8_t>& value) {
       store_->set(key, value);
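Like the Block constructor earlier, GlooStore now takes its store by value and moves it into the member: the pass-by-value-and-move (sink parameter) idiom. Rvalue callers pay a move instead of a refcount bump or copy, and lvalue callers pay exactly one copy. A minimal sketch with a hypothetical class:

#include <string>
#include <utility>

class Holder {
 public:
  // Sink parameter: taken by value, then moved into the member.
  explicit Holder(std::string name) : name_(std::move(name)) {}

 private:
  std::string name_;
};

int main() {
  std::string n = "group0";
  Holder a(n);             // one copy into the parameter, then a move
  Holder b(std::move(n));  // no copy at all: move, then move
}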
@@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) {
 
 bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
   switch (reduceOp) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
     case ReduceOp::SUM:
       return true;
     case ReduceOp::AVG:
@@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum(
       &preMulSum,
       // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
       // tells us that the scalar input is strictly a multiplier.
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
       dataType,
       residence,
@@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->registerSegment(reinterpret_cast<void*>(te.addr_), te.size_);
     }
   }
@@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->deregisterSegment(reinterpret_cast<void*>(te.addr_));
     }
   }
@@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     : Backend(rank, size),
       store_(std::move(store)),
       options_(std::move(options)),
-
       traceKeyStart_(getTraceStartKey("NCCL", rank)),
       traceKeyEnd_(getTraceEndKey("NCCL", rank)),
       terminateProcessGroup_(false),
@@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   // other threads and cause segfaults.
   const auto ncclVersion = getNcclVersion();
   this->setGroupUid(options_->group_name);
-  this->localDeviceCount_ = at::cuda::getNumGPUs();
+  this->localDeviceCount_ = static_cast<int>(at::cuda::getNumGPUs());
   logPrefix_ = createLogPrefix();
   blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false);
   asyncErrorHandling_ = static_cast<ErrorHandlingMode>(
@@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     this->globalRankStride = 0;
   } else {
     bool ranksAreStrided = true;
-    int startRank = options_->global_ranks_in_group[0];
-    int stride =
+    auto startRank = options_->global_ranks_in_group[0];
+    auto stride =
         options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0];
     for (std::vector<uint64_t>::size_type i = 0;
          i < options_->global_ranks_in_group.size();
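The `int` declarations silently narrowed the `uint64_t` entries of `global_ranks_in_group` (the loop's `std::vector<uint64_t>::size_type` gives the element type away); `auto` keeps the deduced 64-bit unsigned type. A small self-contained sketch (the vector contents are made up):

#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t> ranks = {8, 10, 12};
  auto startRank = ranks[0];          // deduced uint64_t, no narrowing
  auto stride = ranks[1] - ranks[0];  // arithmetic stays in uint64_t
  return static_cast<int>(startRank + stride) == 10 ? 0 : 1;
}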
@@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() {
   this->abort();
 }
 
+// NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";
 
@@ -198,7 +198,8 @@ struct DumpPipe {
     if (fd_ == -1) {
       return false;
     }
-    char buf[128];
+    // NOLINTNEXTLINE(*array*)
+    char buf[128]{};
     // non-blocking from O_NONBLOCK above.
     // Ignore EINTR because we already will poll this
     // again later.
@@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // NOTE: We intentionally store raw pointers so that
   // we do not attempt to destroy the event objects on process exit,
   // because cuda may be gone.
-  std::deque<at::cuda::CUDAEvent*>
-      eventsArray_[2]; // 0 for timing=false, 1 for timing=true
+  std::array<std::deque<at::cuda::CUDAEvent*>, 2>
+      eventsArray_; // 0 for timing=false, 1 for timing=true
 };
 
 struct Options : Backend::Options {
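Replacing the C array with std::array keeps the same storage layout but satisfies the avoid-c-arrays checks, and the member gains `.size()`, iterators, and bounds-checked `.at()`. A sketch with a hypothetical element type:

#include <array>
#include <deque>

std::deque<int*> oldStyle[2];              // C array of deques (flagged)
std::array<std::deque<int*>, 2> newStyle;  // equivalent std::array form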
@@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool dumpOnTimeoutOrEx_;
 
   // Whether or not to sleep after an exception is thrown in the watchdog.
-  bool sleepAfterException_;
+  bool sleepAfterException_{};
 
   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
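The `{}` default member initializer value-initializes the flag to `false` at its declaration, so every constructor starts from a defined value even if it never mentions the field. A sketch with a hypothetical struct:

struct Flags {
  bool sleepAfterException{};  // always starts false
  int retries{};               // always starts 0
};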
@@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook {
   PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {}
   PythonOnCompletionHook(const PythonOnCompletionHook&) = default;
 
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~PythonOnCompletionHook() {
     py::gil_scoped_acquire ag;
     hook_.dec_ref();
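bugprone-exception-escape shows up several times in this patch, always on destructors: destructors are implicitly noexcept, so an exception thrown inside one (for example while reacquiring the GIL) goes straight to std::terminate, and the NOLINT records that the code knowingly accepts that risk. A hypothetical sketch:

struct GilHook {
  // NOLINTNEXTLINE(bugprone-exception-escape)
  ~GilHook() {
    // Release interpreter-side resources here; in pathological cases this
    // work could throw, which a noexcept destructor turns into terminate.
  }
};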
@@ -55,7 +55,7 @@ class RankLocal {
   }
 
  private:
-  RankLocal(){};
+  RankLocal() = default;
   thread_local static T* cached_;
   static std::unordered_map<uint64_t, T> thread_id_to_rank_local_;
   static std::shared_mutex lock_;
@@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
   // watchKey() is deprecated and no longer supported.
   virtual void watchKey(
       const std::string& /* unused */,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       WatchKeyCallback /* unused */) {
     TORCH_CHECK(false, "watchKey is deprecated, no implementation support it.");
   }
@@ -3,8 +3,7 @@
 #include <ATen/ATen.h>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 
 // SymmetricMemory represents symmetric allocations across a group of devices.
 // The allocations represented by a SymmetricMemory object are accessible by
@@ -38,7 +37,7 @@ namespace symmetric_memory {
 // for these two barriers, they can operate correctly in parallel.
 class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemory() {}
+  ~SymmetricMemory() override = default;
 
   virtual std::vector<void*> get_buffer_ptrs() = 0;
   virtual std::vector<void*> get_signal_pad_ptrs() = 0;
@@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
 
 class SymmetricMemoryAllocator : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemoryAllocator(){};
+  ~SymmetricMemoryAllocator() override = default;
 
   virtual void* alloc(
       size_t size,
@@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
 TORCH_API bool has_multicast_support(
     c10::DeviceType device_type,
     int device_idx);
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
@@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor {
   explicit IntrusivePtrNoGilDestructor(T* impl)
       // NOLINTNEXTLINE(bugprone-exception-escape)
       : impl_(c10::intrusive_ptr<T>::unsafe_steal_from_new(impl)) {}
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~IntrusivePtrNoGilDestructor() {
     if (impl_) {
       if (PyGILState_Check()) {
@@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request {
 };
 class PythonResponse : public ::c10d::control_plane::Response {
  public:
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
   void setContent(std::string&& content, const std::string& content_type)
       override {
     PYBIND11_OVERRIDE_PURE_NAME(
@@ -1610,7 +1612,7 @@ Arguments:
             if (!store) {
               throw py::value_error("store argument cannot be None");
             }
-            return new ::c10d::PrefixStore(prefix, store);
+            return new ::c10d::PrefixStore(prefix, std::move(store));
           }),
           py::arg("prefix"),
           py::arg("store"))
@@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def(
           "_verify_work_timeout",
           [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-             const c10::intrusive_ptr<::c10d::Work> work,
+             const c10::intrusive_ptr<::c10d::Work>& work,
              const std::chrono::milliseconds& timeout) {
             return self->verifyWorkTimeoutForTest(work, timeout);
           },
@@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT
     {"_c10d_init", c10d_init, METH_NOARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 PyMethodDef* python_functions() {
   return methods;
 }
@@ -7,6 +7,7 @@
 
 namespace c10d::intra_node_comm {
 
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 bool isIntraNodeCommSupported();
 
 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
@@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() {
  * Use c10d::Store to perform allgather on a trivially copyable type.
  */
 template <typename T>
-std::vector<T> storeAllGather(
+static std::vector<T> storeAllGather(
     const c10::intrusive_ptr<c10d::Store>& store,
     const std::string& prefix,
     size_t rank,
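Marking the file-local helper `static` gives it internal linkage, which is what misc-use-internal-linkage asks for on symbols used only within one translation unit; an anonymous namespace achieves the same. A sketch with a hypothetical helper:

// Internal linkage via `static`: no symbol is exported from this
// translation unit, so the linker and the linter are both satisfied.
static int localHelper(int x) {
  return x * 2;
}

// Equivalent alternative: an anonymous namespace.
namespace {
int localHelper2(int x) {
  return x + 1;
}
} // namespace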
@@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() {
     return false;
   }
 
+  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   deviceIdx_ = at::cuda::current_device();
 
   // Exchange hostname and device bus ID
   struct DevInfo {
+    // NOLINTNEXTLINE
     char hostname[HOST_NAME_MAX + 1];
     int deviceIdx;
   };
@@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() {
   }
 
   auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
-  set_group_info(groupName, rank_, worldSize_, store_);
+  set_group_info(
+      groupName, static_cast<int>(rank_), static_cast<int>(worldSize_), store_);
   auto allocator = get_allocator(c10::DeviceType::CUDA);
   symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
   symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);
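`rank_` and `worldSize_` are wider than the `int` parameters of set_group_info, so the conversion is now spelled out with static_cast rather than left implicit and warning-prone. A self-contained sketch (the function and types here are stand-ins, not the real c10d API):

#include <cstddef>

// Stand-in for a callee that takes narrower integer parameters.
void takeRank(int /*rank*/, int /*worldSize*/) {}

void example(std::size_t rank, std::size_t worldSize) {
  // Explicit narrowing: the cast documents that the truncation is intended.
  takeRank(static_cast<int>(rank), static_cast<int>(worldSize));
}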
@@ -6,7 +6,6 @@
 #pragma once
 
 #include <ATen/ATen.h>
-#include <vector>
 
 namespace torch::distributed::c10d::quantization {
 
@@ -6,7 +6,6 @@
 #pragma once
 
 #include <ATen/ATen.h>
-#include <vector>
 
 namespace torch::distributed::c10d::quantization {
 
@@ -7,11 +7,11 @@
 #include <vector>
 
 namespace c10d {
-const int kUnsetSeqNum = 0;
+constexpr int kUnsetSeqNum = 0;
 
 namespace {
 constexpr int kByteOffset = 8;
-}
+} // namespace
 
 // Converts from int to char vec to write in store
 template <typename T>
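`constexpr` makes kUnsetSeqNum a guaranteed compile-time constant usable wherever a constant expression is required, and the `// namespace` comment on the closing brace is the usual readability convention for namespace ends. A tiny sketch using the two constants from the hunk:

constexpr int kUnsetSeqNum = 0;
constexpr int kByteOffset = 8;

// Both are usable in constant-expression contexts, e.g. array bounds
// or static_assert:
static_assert(kByteOffset - kUnsetSeqNum == 8, "compile-time arithmetic");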