From f9ae3fac8c129838554751eb102d1fc933406370 Mon Sep 17 00:00:00 2001
From: cyy
Date: Mon, 28 Oct 2024 05:29:23 +0000
Subject: [PATCH] [Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903
Approved by: https://github.com/ezyang
---
 torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp  | 10 ++++------
 torch/csrc/distributed/c10d/DMAConnectivity.hpp      |  4 +---
 torch/csrc/distributed/c10d/GroupRegistry.cpp        |  1 +
 torch/csrc/distributed/c10d/NCCLUtils.hpp            |  2 +-
 torch/csrc/distributed/c10d/ProcessGroupGloo.hpp     |  4 +++-
 torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp     | 12 ++++++++----
 torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp     |  9 +++++----
 torch/csrc/distributed/c10d/PyProcessGroup.hpp       |  1 +
 torch/csrc/distributed/c10d/RankLocal.hpp            |  2 +-
 torch/csrc/distributed/c10d/Store.hpp                |  1 +
 torch/csrc/distributed/c10d/SymmetricMemory.hpp      | 10 ++++------
 torch/csrc/distributed/c10d/init.cpp                 |  7 +++++--
 torch/csrc/distributed/c10d/intra_node_comm.cpp      |  8 ++++++--
 .../distributed/c10d/quantization/quantization.h     |  1 -
 .../distributed/c10d/quantization/quantization_gpu.h |  1 -
 torch/csrc/distributed/c10d/sequence_num.hpp         |  4 ++--
 16 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
index 4fa907a95288..6efdcf627767 100644
--- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp
@@ -4,8 +4,7 @@
 #include
 #include

-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {

 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 using HandleType = CUmemGenericAllocationHandle;
@@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target {
       size_t block_size,
       size_t buffer_size,
       size_t signal_pad_offset,
-      const std::string& group_name)
+      std::string group_name)
       : handle(handle),
         device_idx(device_idx),
         block_size(block_size),
         buffer_size(buffer_size),
         signal_pad_offset(signal_pad_offset),
-        group_name(group_name),
+        group_name(std::move(group_name)),
         symm_mem(nullptr) {}
 };

@@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator {
   std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
 };

-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.hpp b/torch/csrc/distributed/c10d/DMAConnectivity.hpp
index cede6aa265c7..db6baa3969ef 100644
--- a/torch/csrc/distributed/c10d/DMAConnectivity.hpp
+++ b/torch/csrc/distributed/c10d/DMAConnectivity.hpp
@@ -1,7 +1,5 @@
 #pragma once

-#include
-
 #include

 namespace c10d {
@@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target {

 struct DMAConnectivityDetector : c10::intrusive_ptr_target {
   virtual c10::intrusive_ptr<DMAConnectivity> detect() = 0;
-  virtual ~DMAConnectivityDetector() {}
+  ~DMAConnectivityDetector() override = default;
 };

 C10_EXPORT void register_dma_connectivity_detector(
diff --git a/torch/csrc/distributed/c10d/GroupRegistry.cpp b/torch/csrc/distributed/c10d/GroupRegistry.cpp
index 2a735a4c9959..c56c91ef6ec3 100644
--- a/torch/csrc/distributed/c10d/GroupRegistry.cpp
+++ b/torch/csrc/distributed/c10d/GroupRegistry.cpp
@@ -11,6 +11,7 @@ class GroupRegistry {
  public:
   void register_group(
       std::string group_name,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       c10::intrusive_ptr<c10d::ProcessGroup> group) {
     std::unique_lock write_lock(lock_);
     auto [_, inserted] =
diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp
index 0089d453bb85..32361e17580f 100644
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -370,7 +370,7 @@ class NCCLComm {
   NCCLComm& operator=(NCCLComm&& other) = delete;

   // Move constructable
-  // NOLINTNEXTLINE(.*-noexcept-move-.*)
+  // NOLINTNEXTLINE(*-noexcept-move-*)
   NCCLComm(NCCLComm&& other) {
     // Using other's lock, as it reads other's states
     // Can not use this.mutex_, as this object is being constructed.
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
index 1ebead6b598e..111cf14bb080 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend {
   // Wrap c10d store as Gloo store
   class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
-    GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
+    GlooStore(c10::intrusive_ptr<::c10d::Store> store)
+        : store_(std::move(store)) {}

     void setUint(const std::string& key, const std::vector<uint8_t>& value) {
       store_->set(key, value);
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index c9564a31f057..c98e76b1fd7a 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) {

 bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
   switch (reduceOp) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
     case ReduceOp::SUM:
       return true;
     case ReduceOp::AVG:
@@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum(
       &preMulSum,
       // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
       // tells us that the scalar input is strictly a multiplier.
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
       dataType,
       residence,
@@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->registerSegment(reinterpret_cast<void*>(te.addr_), te.size_);
     }
   }
@@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->deregisterSegment(reinterpret_cast<void*>(te.addr_));
     }
   }
@@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     : Backend(rank, size),
       store_(std::move(store)),
       options_(std::move(options)),
-
       traceKeyStart_(getTraceStartKey("NCCL", rank)),
       traceKeyEnd_(getTraceEndKey("NCCL", rank)),
       terminateProcessGroup_(false),
@@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   // other threads and cause segfaults.
   const auto ncclVersion = getNcclVersion();
   this->setGroupUid(options_->group_name);
-  this->localDeviceCount_ = at::cuda::getNumGPUs();
+  this->localDeviceCount_ = static_cast<int>(at::cuda::getNumGPUs());
   logPrefix_ = createLogPrefix();
   blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false);
   asyncErrorHandling_ = static_cast<ErrorHandlingMode>(
@@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     this->globalRankStride = 0;
   } else {
     bool ranksAreStrided = true;
-    int startRank = options_->global_ranks_in_group[0];
-    int stride =
+    auto startRank = options_->global_ranks_in_group[0];
+    auto stride =
         options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0];
     for (std::vector<uint64_t>::size_type i = 0;
          i < options_->global_ranks_in_group.size();
@@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() {
   this->abort();
 }

+// NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 839463a9d8be..9ec169d13863 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -198,7 +198,8 @@ struct DumpPipe {
     if (fd_ == -1) {
       return false;
     }
-    char buf[128];
+    // NOLINTNEXTLINE(*array*)
+    char buf[128]{};
     // non-blocking from O_NONBLOCK above.
     // Ignore EINTR because we already will poll this
     // again later.
@@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // NOTE: We intentionally store raw pointers so that
     // we do not attempt to destroy the event objects on process exit,
     // because cuda may be gone.
-    std::deque<at::cuda::CUDAEvent*>
-        eventsArray_[2]; // 0 for timing=false, 1 for timing=true
+    std::array<std::deque<at::cuda::CUDAEvent*>, 2>
+        eventsArray_; // 0 for timing=false, 1 for timing=true
   };

   struct Options : Backend::Options {
@@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool dumpOnTimeoutOrEx_;

   // Whether or not to sleep after an exception is thrown in the watchdog.
-  bool sleepAfterException_;
+  bool sleepAfterException_{};

   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
diff --git a/torch/csrc/distributed/c10d/PyProcessGroup.hpp b/torch/csrc/distributed/c10d/PyProcessGroup.hpp
index 3655984d452a..81021bdf2c9a 100644
--- a/torch/csrc/distributed/c10d/PyProcessGroup.hpp
+++ b/torch/csrc/distributed/c10d/PyProcessGroup.hpp
@@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook {
   PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {}
   PythonOnCompletionHook(const PythonOnCompletionHook&) = default;

+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~PythonOnCompletionHook() {
     py::gil_scoped_acquire ag;
     hook_.dec_ref();
diff --git a/torch/csrc/distributed/c10d/RankLocal.hpp b/torch/csrc/distributed/c10d/RankLocal.hpp
index b3a649659af4..33f074746d28 100644
--- a/torch/csrc/distributed/c10d/RankLocal.hpp
+++ b/torch/csrc/distributed/c10d/RankLocal.hpp
@@ -55,7 +55,7 @@ class RankLocal {
   }

  private:
-  RankLocal(){};
+  RankLocal() = default;
   thread_local static T* cached_;
   static std::unordered_map thread_id_to_rank_local_;
   static std::shared_mutex lock_;
diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp
index d18de830ff7f..0b6dfe48d0d0 100644
--- a/torch/csrc/distributed/c10d/Store.hpp
+++ b/torch/csrc/distributed/c10d/Store.hpp
@@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
   // watchKey() is deprecated and no longer supported.
   virtual void watchKey(
       const std::string& /* unused */,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       WatchKeyCallback /* unused */) {
     TORCH_CHECK(false, "watchKey is deprecated, no implementation support it.");
   }
diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.hpp b/torch/csrc/distributed/c10d/SymmetricMemory.hpp
index 55b212ef9015..2eaecfce2c8c 100644
--- a/torch/csrc/distributed/c10d/SymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/SymmetricMemory.hpp
@@ -3,8 +3,7 @@
 #include
 #include

-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {

 // SymmetricMemory represents symmetric allocations across a group of devices.
 // The allocations represented by a SymmetricMemory object are accessible by
@@ -38,7 +37,7 @@ namespace symmetric_memory {
 // for these two barriers, they can operate correctly in parallel.
 class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemory() {}
+  ~SymmetricMemory() override = default;

   virtual std::vector<void*> get_buffer_ptrs() = 0;
   virtual std::vector<void*> get_signal_pad_ptrs() = 0;
@@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {

 class SymmetricMemoryAllocator : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemoryAllocator(){};
+  ~SymmetricMemoryAllocator() override = default;

   virtual void* alloc(
       size_t size,
@@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
 TORCH_API bool has_multicast_support(
     c10::DeviceType device_type,
     int device_idx);
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index b1cebfe0502b..2c3f836a394f 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor {
   explicit IntrusivePtrNoGilDestructor(T* impl)
       // NOLINTNEXTLINE(bugprone-exception-escape)
       : impl_(c10::intrusive_ptr<T>::unsafe_steal_from_new(impl)) {}
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~IntrusivePtrNoGilDestructor() {
     if (impl_) {
       if (PyGILState_Check()) {
@@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request {
 };

 class PythonResponse : public ::c10d::control_plane::Response {
  public:
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
   void setContent(std::string&& content, const std::string& content_type)
       override {
     PYBIND11_OVERRIDE_PURE_NAME(
@@ -1610,7 +1612,7 @@ Arguments:
             if (!store) {
               throw py::value_error("store argument cannot be None");
             }
-            return new ::c10d::PrefixStore(prefix, store);
+            return new ::c10d::PrefixStore(prefix, std::move(store));
           }),
           py::arg("prefix"),
           py::arg("store"))
@@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def(
           "_verify_work_timeout",
           [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-             const c10::intrusive_ptr<::c10d::Work> work,
+             const c10::intrusive_ptr<::c10d::Work>& work,
              const std::chrono::milliseconds& timeout) {
             return self->verifyWorkTimeoutForTest(work, timeout);
           },
@@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT
     {"_c10d_init", c10d_init, METH_NOARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};

+// NOLINTNEXTLINE(misc-use-internal-linkage)
 PyMethodDef* python_functions() {
   return methods;
 }
diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp
index 6a61f16e9aea..c0c53d220d86 100644
--- a/torch/csrc/distributed/c10d/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp
@@ -7,6 +7,7 @@

 namespace c10d::intra_node_comm {

+// NOLINTNEXTLINE(misc-use-internal-linkage)
 bool isIntraNodeCommSupported();

 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
@@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() {
  * Use c10d::Store to perform allgather on a trivially copyable type.
  */
 template <typename T>
-std::vector<T> storeAllGather(
+static std::vector<T> storeAllGather(
     const c10::intrusive_ptr<Store>& store,
     const std::string& prefix,
     size_t rank,
@@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() {
     return false;
   }

+  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   deviceIdx_ = at::cuda::current_device();

   // Exchange hostname and device bus ID
   struct DevInfo {
+    // NOLINTNEXTLINE
     char hostname[HOST_NAME_MAX + 1];
     int deviceIdx;
   };
@@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() {
   }

   auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
-  set_group_info(groupName, rank_, worldSize_, store_);
+  set_group_info(
+      groupName, static_cast<int>(rank_), static_cast<int>(worldSize_), store_);
   auto allocator = get_allocator(c10::DeviceType::CUDA);
   symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
   symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);
diff --git a/torch/csrc/distributed/c10d/quantization/quantization.h b/torch/csrc/distributed/c10d/quantization/quantization.h
index 3d2f23de421b..1a398d75004e 100644
--- a/torch/csrc/distributed/c10d/quantization/quantization.h
+++ b/torch/csrc/distributed/c10d/quantization/quantization.h
@@ -6,7 +6,6 @@
 #pragma once

 #include
-#include

 namespace torch::distributed::c10d::quantization {
diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
index f865599595d3..c45d600b780f 100644
--- a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
+++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
@@ -6,7 +6,6 @@
 #pragma once

 #include
-#include

 namespace torch::distributed::c10d::quantization {
diff --git a/torch/csrc/distributed/c10d/sequence_num.hpp b/torch/csrc/distributed/c10d/sequence_num.hpp
index 38bd4cb5ed9d..a32bb3dd6026 100644
--- a/torch/csrc/distributed/c10d/sequence_num.hpp
+++ b/torch/csrc/distributed/c10d/sequence_num.hpp
@@ -7,11 +7,11 @@
 #include

 namespace c10d {
-const int kUnsetSeqNum = 0;
+constexpr int kUnsetSeqNum = 0;

 namespace {
 constexpr int kByteOffset = 8;
-}
+} // namespace

 // Converts from int to char vec to write in store
 template <typename T>
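
Note for reviewers: the change that recurs most often in this patch is the fix for performance-unnecessary-value-param. Block, GlooStore, and the PrefixStore binding now take their sink argument by value and std::move it into the member, so an rvalue argument is moved rather than copied, while an lvalue argument costs one copy plus one move, essentially the same as the old const-reference version. The snippet below is a minimal standalone sketch of that pattern, together with the `~X() override = default;` destructor cleanup applied to DMAConnectivityDetector, SymmetricMemory, and SymmetricMemoryAllocator. The Detector and Config types here are illustrative stand-ins, not PyTorch code.

    #include <string>
    #include <utility>

    // Illustrative base/derived pair mirroring the destructor cleanups in
    // this patch: a defaulted destructor marked `override` replaces an
    // empty virtual body such as `virtual ~DMAConnectivityDetector() {}`.
    struct Detector {
      virtual ~Detector() = default;
    };

    struct PcieDetector : Detector {
      ~PcieDetector() override = default;
    };

    // Illustrative sink-parameter pattern mirroring Block and GlooStore:
    // take the argument by value, then move it into the member. An rvalue
    // caller pays one move; an lvalue caller pays one copy plus one move.
    class Config {
     public:
      explicit Config(std::string group_name)
          : group_name_(std::move(group_name)) {}

     private:
      std::string group_name_;
    };

    int main() {
      Config from_rvalue("temporary: moved, not copied");
      std::string name = "lvalue: copied into the parameter, then moved";
      Config from_lvalue(name);
      PcieDetector detector;
      (void)from_rvalue;
      (void)from_lvalue;
      (void)detector;
      return 0;
    }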