From f9ae3fac8c129838554751eb102d1fc933406370 Mon Sep 17 00:00:00 2001
From: cyy
Date: Mon, 28 Oct 2024 05:29:23 +0000
Subject: [PATCH] [Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903
Approved by: https://github.com/ezyang
---
 torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp  | 10 ++++------
 torch/csrc/distributed/c10d/DMAConnectivity.hpp      |  4 +---
 torch/csrc/distributed/c10d/GroupRegistry.cpp        |  1 +
 torch/csrc/distributed/c10d/NCCLUtils.hpp            |  2 +-
 torch/csrc/distributed/c10d/ProcessGroupGloo.hpp     |  4 +++-
 torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp     | 12 ++++++++----
 torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp     |  9 +++++----
 torch/csrc/distributed/c10d/PyProcessGroup.hpp       |  1 +
 torch/csrc/distributed/c10d/RankLocal.hpp            |  2 +-
 torch/csrc/distributed/c10d/Store.hpp                |  1 +
 torch/csrc/distributed/c10d/SymmetricMemory.hpp      | 10 ++++------
 torch/csrc/distributed/c10d/init.cpp                 |  7 +++++--
 torch/csrc/distributed/c10d/intra_node_comm.cpp      |  8 ++++++--
 .../distributed/c10d/quantization/quantization.h     |  1 -
 .../distributed/c10d/quantization/quantization_gpu.h |  1 -
 torch/csrc/distributed/c10d/sequence_num.hpp         |  4 ++--
 16 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
index 4fa907a95288..6efdcf627767 100644
--- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
@@ -4,8 +4,7 @@
 #include
 #include

-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {

 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 using HandleType = CUmemGenericAllocationHandle;
@@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target {
       size_t block_size,
       size_t buffer_size,
       size_t signal_pad_offset,
-      const std::string& group_name)
+      std::string group_name)
       : handle(handle),
         device_idx(device_idx),
         block_size(block_size),
         buffer_size(buffer_size),
         signal_pad_offset(signal_pad_offset),
-        group_name(group_name),
+        group_name(std::move(group_name)),
         symm_mem(nullptr) {}
 };

@@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator {
   std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
 };

-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.hpp b/torch/csrc/distributed/c10d/DMAConnectivity.hpp
index cede6aa265c7..db6baa3969ef 100644
--- a/torch/csrc/distributed/c10d/DMAConnectivity.hpp
+++ b/torch/csrc/distributed/c10d/DMAConnectivity.hpp
@@ -1,7 +1,5 @@
 #pragma once

-#include
-
 #include

 namespace c10d {
@@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target {

 struct DMAConnectivityDetector : c10::intrusive_ptr_target {
   virtual c10::intrusive_ptr<DMAConnectivity> detect() = 0;
-  virtual ~DMAConnectivityDetector() {}
+  ~DMAConnectivityDetector() override = default;
 };

 C10_EXPORT void register_dma_connectivity_detector(
diff --git a/torch/csrc/distributed/c10d/GroupRegistry.cpp b/torch/csrc/distributed/c10d/GroupRegistry.cpp
index 2a735a4c9959..c56c91ef6ec3 100644
--- a/torch/csrc/distributed/c10d/GroupRegistry.cpp
+++ b/torch/csrc/distributed/c10d/GroupRegistry.cpp
@@ -11,6 +11,7 @@ class GroupRegistry {
  public:
   void register_group(
       std::string group_name,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       c10::intrusive_ptr<c10d::ProcessGroup> group) {
     std::unique_lock write_lock(lock_);
     auto [_, inserted] =
diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp
index 0089d453bb85..32361e17580f 100644
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -370,7 +370,7 @@ class NCCLComm {
   NCCLComm& operator=(NCCLComm&& other) = delete;

   // Move constructable
-  // NOLINTNEXTLINE(.*-noexcept-move-.*)
+  // NOLINTNEXTLINE(*-noexcept-move-*)
   NCCLComm(NCCLComm&& other) {
     // Using other's lock, as it reads other's states
     // Can not use this.mutex_, as this object is being constructed.
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
index 1ebead6b598e..111cf14bb080 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend {
   // Wrap c10d store as Gloo store
   class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
-    GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
+    GlooStore(c10::intrusive_ptr<::c10d::Store> store)
+        : store_(std::move(store)) {}

     void setUint(const std::string& key, const std::vector<uint8_t>& value) {
       store_->set(key, value);
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index c9564a31f057..c98e76b1fd7a 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) {

 bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
   switch (reduceOp) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
     case ReduceOp::SUM:
       return true;
     case ReduceOp::AVG:
@@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum(
       &preMulSum,
       // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
       // tells us that the scalar input is strictly a multiplier.
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
       dataType,
       residence,
@@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->registerSegment(reinterpret_cast<void*>(te.addr_), te.size_);
     }
   }
@@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->deregisterSegment(reinterpret_cast<void*>(te.addr_));
     }
   }
@@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     : Backend(rank, size),
       store_(std::move(store)),
       options_(std::move(options)),
-
       traceKeyStart_(getTraceStartKey("NCCL", rank)),
       traceKeyEnd_(getTraceEndKey("NCCL", rank)),
       terminateProcessGroup_(false),
@@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   // other threads and cause segfaults.
   const auto ncclVersion = getNcclVersion();
   this->setGroupUid(options_->group_name);
-  this->localDeviceCount_ = at::cuda::getNumGPUs();
+  this->localDeviceCount_ = static_cast<int>(at::cuda::getNumGPUs());
   logPrefix_ = createLogPrefix();
   blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false);
   asyncErrorHandling_ = static_cast<ErrorHandlingMode>(
@@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     this->globalRankStride = 0;
   } else {
     bool ranksAreStrided = true;
-    int startRank = options_->global_ranks_in_group[0];
-    int stride =
+    auto startRank = options_->global_ranks_in_group[0];
+    auto stride =
         options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0];
     for (std::vector<uint64_t>::size_type i = 0;
          i < options_->global_ranks_in_group.size();
@@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() {
   this->abort();
 }

+// NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 839463a9d8be..9ec169d13863 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -198,7 +198,8 @@ struct DumpPipe {
     if (fd_ == -1) {
       return false;
     }
-    char buf[128];
+    // NOLINTNEXTLINE(*array*)
+    char buf[128]{};
     // non-blocking from O_NONBLOCK above.
     // Ignore EINTR because we already will poll this
     // again later.
@@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // NOTE: We intentionally store raw pointers so that
     // we do not attempt to destroy the event objects on process exit,
     // because cuda may be gone.
-    std::deque<at::cuda::CUDAEvent*>
-        eventsArray_[2]; // 0 for timing=false, 1 for timing=true
+    std::array<std::deque<at::cuda::CUDAEvent*>, 2>
+        eventsArray_; // 0 for timing=false, 1 for timing=true
   };

   struct Options : Backend::Options {
@@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool dumpOnTimeoutOrEx_;

   // Whether or not to sleep after an exception is thrown in the watchdog.
-  bool sleepAfterException_;
+  bool sleepAfterException_{};

   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
diff --git a/torch/csrc/distributed/c10d/PyProcessGroup.hpp b/torch/csrc/distributed/c10d/PyProcessGroup.hpp
index 3655984d452a..81021bdf2c9a 100644
--- a/torch/csrc/distributed/c10d/PyProcessGroup.hpp
+++ b/torch/csrc/distributed/c10d/PyProcessGroup.hpp
@@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook {
   PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {}
   PythonOnCompletionHook(const PythonOnCompletionHook&) = default;

+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~PythonOnCompletionHook() {
     py::gil_scoped_acquire ag;
     hook_.dec_ref();
diff --git a/torch/csrc/distributed/c10d/RankLocal.hpp b/torch/csrc/distributed/c10d/RankLocal.hpp
index b3a649659af4..33f074746d28 100644
--- a/torch/csrc/distributed/c10d/RankLocal.hpp
+++ b/torch/csrc/distributed/c10d/RankLocal.hpp
@@ -55,7 +55,7 @@ class RankLocal {
   }

  private:
-  RankLocal(){};
+  RankLocal() = default;
   thread_local static T* cached_;
   static std::unordered_map thread_id_to_rank_local_;
   static std::shared_mutex lock_;
diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp
index d18de830ff7f..0b6dfe48d0d0 100644
--- a/torch/csrc/distributed/c10d/Store.hpp
+++ b/torch/csrc/distributed/c10d/Store.hpp
@@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
   // watchKey() is deprecated and no longer supported.
   virtual void watchKey(
       const std::string& /* unused */,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       WatchKeyCallback /* unused */) {
     TORCH_CHECK(false, "watchKey is deprecated, no implementation support it.");
   }
diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.hpp b/torch/csrc/distributed/c10d/SymmetricMemory.hpp
index 55b212ef9015..2eaecfce2c8c 100644
--- a/torch/csrc/distributed/c10d/SymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/SymmetricMemory.hpp
@@ -3,8 +3,7 @@
 #include
 #include

-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {

 // SymmetricMemory represents symmetric allocations across a group of devices.
 // The allocations represented by a SymmetricMemory object are accessible by
@@ -38,7 +37,7 @@ namespace symmetric_memory {
 // for these two barriers, they can operate correctly in parallel.
 class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemory() {}
+  ~SymmetricMemory() override = default;

   virtual std::vector<void*> get_buffer_ptrs() = 0;
   virtual std::vector<void*> get_signal_pad_ptrs() = 0;
@@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {

 class SymmetricMemoryAllocator : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemoryAllocator(){};
+  ~SymmetricMemoryAllocator() override = default;

   virtual void* alloc(
       size_t size,
@@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
 TORCH_API bool has_multicast_support(
     c10::DeviceType device_type,
     int device_idx);
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index b1cebfe0502b..2c3f836a394f 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor {
   explicit IntrusivePtrNoGilDestructor(T* impl)
       // NOLINTNEXTLINE(bugprone-exception-escape)
       : impl_(c10::intrusive_ptr<T>::unsafe_steal_from_new(impl)) {}
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~IntrusivePtrNoGilDestructor() {
     if (impl_) {
       if (PyGILState_Check()) {
@@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request {
 };

 class PythonResponse : public ::c10d::control_plane::Response {
  public:
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
   void setContent(std::string&& content, const std::string& content_type)
       override {
     PYBIND11_OVERRIDE_PURE_NAME(
@@ -1610,7 +1612,7 @@ Arguments:
             if (!store) {
               throw py::value_error("store argument cannot be None");
             }
-            return new ::c10d::PrefixStore(prefix, store);
+            return new ::c10d::PrefixStore(prefix, std::move(store));
           }),
           py::arg("prefix"),
           py::arg("store"))
@@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def(
           "_verify_work_timeout",
           [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-             const c10::intrusive_ptr<::c10d::Work> work,
+             const c10::intrusive_ptr<::c10d::Work>& work,
              const std::chrono::milliseconds& timeout) {
             return self->verifyWorkTimeoutForTest(work, timeout);
           },
@@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT
     {"_c10d_init", c10d_init, METH_NOARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};

+// NOLINTNEXTLINE(misc-use-internal-linkage)
 PyMethodDef* python_functions() {
   return methods;
 }
diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp
index 6a61f16e9aea..c0c53d220d86 100644
--- a/torch/csrc/distributed/c10d/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp
@@ -7,6 +7,7 @@

 namespace c10d::intra_node_comm {

+// NOLINTNEXTLINE(misc-use-internal-linkage)
 bool isIntraNodeCommSupported();

 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
@@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() {
  * Use c10d::Store to perform allgather on a trivially copyable type.
  */
 template <typename T>
-std::vector<T> storeAllGather(
+static std::vector<T> storeAllGather(
     const c10::intrusive_ptr<Store>& store,
     const std::string& prefix,
     size_t rank,
@@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() {
     return false;
   }

+  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   deviceIdx_ = at::cuda::current_device();

   // Exchange hostname and device bus ID
   struct DevInfo {
+    // NOLINTNEXTLINE
     char hostname[HOST_NAME_MAX + 1];
     int deviceIdx;
   };
@@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() {
   }

   auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
-  set_group_info(groupName, rank_, worldSize_, store_);
+  set_group_info(
+      groupName, static_cast<int>(rank_), static_cast<int>(worldSize_), store_);
   auto allocator = get_allocator(c10::DeviceType::CUDA);
   symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
   symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);
diff --git a/torch/csrc/distributed/c10d/quantization/quantization.h b/torch/csrc/distributed/c10d/quantization/quantization.h
index 3d2f23de421b..1a398d75004e 100644
--- a/torch/csrc/distributed/c10d/quantization/quantization.h
+++ b/torch/csrc/distributed/c10d/quantization/quantization.h
@@ -6,7 +6,6 @@
 #pragma once

 #include
-#include

 namespace torch::distributed::c10d::quantization {
diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
index f865599595d3..c45d600b780f 100644
--- a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
+++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
@@ -6,7 +6,6 @@
 #pragma once

 #include
-#include

 namespace torch::distributed::c10d::quantization {
diff --git a/torch/csrc/distributed/c10d/sequence_num.hpp b/torch/csrc/distributed/c10d/sequence_num.hpp
index 38bd4cb5ed9d..a32bb3dd6026 100644
--- a/torch/csrc/distributed/c10d/sequence_num.hpp
+++ b/torch/csrc/distributed/c10d/sequence_num.hpp
@@ -7,11 +7,11 @@
 #include

 namespace c10d {
-const int kUnsetSeqNum = 0;
+constexpr int kUnsetSeqNum = 0;

 namespace {
 constexpr int kByteOffset = 8;
-}
+} // namespace

 // Converts from int to char vec to write in store
 template <typename T>
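
Note for reviewers: the change that recurs most often in this patch is the fix for performance-unnecessary-value-param. Block, GlooStore, and the PrefixStore binding now take their sink argument by value and std::move it into the member, so an rvalue argument is moved rather than copied, while an lvalue argument costs one copy plus one move, essentially the same as the old const-reference version. The snippet below is a minimal standalone sketch of that pattern, together with the `~X() override = default;` destructor cleanup applied to DMAConnectivityDetector, SymmetricMemory, and SymmetricMemoryAllocator. The Detector and Config types here are illustrative stand-ins, not PyTorch code.

    #include <string>
    #include <utility>

    // Illustrative base/derived pair mirroring the destructor cleanups in
    // this patch: a defaulted destructor marked `override` replaces an
    // empty virtual body such as `virtual ~DMAConnectivityDetector() {}`.
    struct Detector {
      virtual ~Detector() = default;
    };

    struct PcieDetector : Detector {
      ~PcieDetector() override = default;
    };

    // Illustrative sink-parameter pattern mirroring Block and GlooStore:
    // take the argument by value, then move it into the member. An rvalue
    // caller pays one move; an lvalue caller pays one copy plus one move.
    class Config {
     public:
      explicit Config(std::string group_name)
          : group_name_(std::move(group_name)) {}

     private:
      std::string group_name_;
    };

    int main() {
      Config from_rvalue("temporary: moved, not copied");
      std::string name = "lvalue: copied into the parameter, then moved";
      Config from_lvalue(name);
      PcieDetector detector;
      (void)from_rvalue;
      (void)from_lvalue;
      (void)detector;
      return 0;
    }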