[Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903
Approved by: https://github.com/ezyang
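
Most of the diff applies a small set of recurring clang-tidy fixes: concatenating nested namespaces, defaulting trivial destructors (with `override` on derived classes), taking sink parameters by value and moving them, zero-initializing plain data members, and `constexpr`-qualifying constants; warnings that cannot be fixed cleanly are suppressed in place with `NOLINTNEXTLINE`. Below is a minimal sketch of the main patterns, using a hypothetical `Widget` class rather than code from this PR:

```cpp
#include <string>
#include <utility>

// modernize-concat-nested-namespaces: one declaration instead of two.
namespace c10d::symmetric_memory {

struct Widget {
  // performance-unnecessary-value-param: accept the sink argument by
  // value and move it into the member, rather than copying from a
  // const reference.
  explicit Widget(std::string group_name)
      : group_name_(std::move(group_name)) {}

  // modernize-use-equals-default: `= default` instead of an empty body.
  ~Widget() = default;

 private:
  std::string group_name_;
};

} // namespace c10d::symmetric_memory
```

Pass-by-value plus `std::move` costs one move for lvalue arguments and zero copies for rvalue arguments, which is why clang-tidy prefers it for parameters that end up stored.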
This commit is contained in:
cyy
2024-10-28 05:29:23 +00:00
committed by PyTorch MergeBot
parent 39aa3cb8d6
commit f9ae3fac8c
16 changed files with 43 additions and 34 deletions

View File

@@ -4,8 +4,7 @@
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 using HandleType = CUmemGenericAllocationHandle;
@@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target {
 size_t block_size,
 size_t buffer_size,
 size_t signal_pad_offset,
-const std::string& group_name)
+std::string group_name)
 : handle(handle),
 device_idx(device_idx),
 block_size(block_size),
 buffer_size(buffer_size),
 signal_pad_offset(signal_pad_offset),
-group_name(group_name),
+group_name(std::move(group_name)),
 symm_mem(nullptr) {}
 };
@@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator {
 std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
 };
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory

View File

@@ -1,7 +1,5 @@
 #pragma once
-#include <optional>
 #include <ATen/ATen.h>
 namespace c10d {
@@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target {
 struct DMAConnectivityDetector : c10::intrusive_ptr_target {
 virtual c10::intrusive_ptr<DMAConnectivity> detect() = 0;
-virtual ~DMAConnectivityDetector() {}
+~DMAConnectivityDetector() override = default;
 };
 C10_EXPORT void register_dma_connectivity_detector(

View File

@@ -11,6 +11,7 @@ class GroupRegistry {
 public:
 void register_group(
 std::string group_name,
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 c10::intrusive_ptr<c10d::ProcessGroup> group) {
 std::unique_lock write_lock(lock_);
 auto [_, inserted] =

View File

@@ -370,7 +370,7 @@ class NCCLComm {
 NCCLComm& operator=(NCCLComm&& other) = delete;
 // Move constructable
-// NOLINTNEXTLINE(.*-noexcept-move-.*)
+// NOLINTNEXTLINE(*-noexcept-move-*)
 NCCLComm(NCCLComm&& other) {
 // Using other's lock, as it reads other's states
 // Can not use this.mutex_, as this object is being constructed.

View File

@@ -6,6 +6,7 @@
 #include <deque>
 #include <mutex>
 #include <thread>
+#include <utility>
 #include <vector>
 #include <gloo/algorithm.h>
@@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend {
 // Wrap c10d store as Gloo store
 class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
 public:
-GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
+GlooStore(c10::intrusive_ptr<::c10d::Store> store)
+    : store_(std::move(store)) {}
 void setUint(const std::string& key, const std::vector<uint8_t>& value) {
 store_->set(key, value);

View File

@@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) {
 bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
 switch (reduceOp) {
+// NOLINTNEXTLINE(bugprone-branch-clone)
 case ReduceOp::SUM:
 return true;
 case ReduceOp::AVG:
@@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum(
 &preMulSum,
 // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
 // tells us that the scalar input is strictly a multiplier.
+// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
 /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
 dataType,
 residence,
@@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook(
 auto& ncclComm = it.first;
 auto& devIdx = it.second;
 if (te.device_ == devIdx) {
+// NOLINTNEXTLINE(performance-no-int-to-ptr)
 ncclComm->registerSegment(reinterpret_cast<void*>(te.addr_), te.size_);
 }
 }
@@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook(
 auto& ncclComm = it.first;
 auto& devIdx = it.second;
 if (te.device_ == devIdx) {
+// NOLINTNEXTLINE(performance-no-int-to-ptr)
 ncclComm->deregisterSegment(reinterpret_cast<void*>(te.addr_));
 }
 }
@@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
 : Backend(rank, size),
 store_(std::move(store)),
 options_(std::move(options)),
 traceKeyStart_(getTraceStartKey("NCCL", rank)),
 traceKeyEnd_(getTraceEndKey("NCCL", rank)),
 terminateProcessGroup_(false),
@@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
 // other threads and cause segfaults.
 const auto ncclVersion = getNcclVersion();
 this->setGroupUid(options_->group_name);
-this->localDeviceCount_ = at::cuda::getNumGPUs();
+this->localDeviceCount_ = static_cast<int>(at::cuda::getNumGPUs());
 logPrefix_ = createLogPrefix();
 blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false);
 asyncErrorHandling_ = static_cast<ErrorHandlingMode>(
@@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
 this->globalRankStride = 0;
 } else {
 bool ranksAreStrided = true;
-int startRank = options_->global_ranks_in_group[0];
-int stride =
+auto startRank = options_->global_ranks_in_group[0];
+auto stride =
 options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0];
 for (std::vector<uint64_t>::size_type i = 0;
 i < options_->global_ranks_in_group.size();
@@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() {
 this->abort();
 }
+// NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
 LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";

View File

@@ -198,7 +198,8 @@ struct DumpPipe {
 if (fd_ == -1) {
 return false;
 }
-char buf[128];
+// NOLINTNEXTLINE(*array*)
+char buf[128]{};
 // non-blocking from O_NONBLOCK above.
 // Ignore EINTR because we already will poll this
 // again later.
@@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
 // NOTE: We intentionally store raw pointers so that
 // we do not attempt to destroy the event objects on process exit,
 // because cuda may be gone.
-std::deque<at::cuda::CUDAEvent*>
-eventsArray_[2]; // 0 for timing=false, 1 for timing=true
+std::array<std::deque<at::cuda::CUDAEvent*>, 2>
+eventsArray_; // 0 for timing=false, 1 for timing=true
 };
 struct Options : Backend::Options {
@@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
 bool dumpOnTimeoutOrEx_;
 // Whether or not to sleep after an exception is thrown in the watchdog.
-bool sleepAfterException_;
+bool sleepAfterException_{};
 // Whether or not to enable nan check for input tensors to collectives.
 bool enableNanCheck_;

View File

@@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook {
 PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {}
 PythonOnCompletionHook(const PythonOnCompletionHook&) = default;
+// NOLINTNEXTLINE(bugprone-exception-escape)
 ~PythonOnCompletionHook() {
 py::gil_scoped_acquire ag;
 hook_.dec_ref();

View File

@@ -55,7 +55,7 @@ class RankLocal {
 }
 private:
-RankLocal(){};
+RankLocal() = default;
 thread_local static T* cached_;
 static std::unordered_map<uint64_t, T> thread_id_to_rank_local_;
 static std::shared_mutex lock_;

View File

@@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
 // watchKey() is deprecated and no longer supported.
 virtual void watchKey(
 const std::string& /* unused */,
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 WatchKeyCallback /* unused */) {
 TORCH_CHECK(false, "watchKey is deprecated, no implementation support it.");
 }

View File

@@ -3,8 +3,7 @@
 #include <ATen/ATen.h>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 // SymmetricMemory represents symmetric allocations across a group of devices.
 // The allocations represented by a SymmetricMemory object are accessible by
@@ -38,7 +37,7 @@ namespace symmetric_memory {
 // for these two barriers, they can operate correctly in parallel.
 class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
 public:
-virtual ~SymmetricMemory() {}
+~SymmetricMemory() override = default;
 virtual std::vector<void*> get_buffer_ptrs() = 0;
 virtual std::vector<void*> get_signal_pad_ptrs() = 0;
@@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
 class SymmetricMemoryAllocator : public c10::intrusive_ptr_target {
 public:
-virtual ~SymmetricMemoryAllocator(){};
+~SymmetricMemoryAllocator() override = default;
 virtual void* alloc(
 size_t size,
@@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
 TORCH_API bool has_multicast_support(
 c10::DeviceType device_type,
 int device_idx);
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory

View File

@@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor {
 explicit IntrusivePtrNoGilDestructor(T* impl)
 // NOLINTNEXTLINE(bugprone-exception-escape)
 : impl_(c10::intrusive_ptr<T>::unsafe_steal_from_new(impl)) {}
+// NOLINTNEXTLINE(bugprone-exception-escape)
 ~IntrusivePtrNoGilDestructor() {
 if (impl_) {
 if (PyGILState_Check()) {
@@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request {
 };
 class PythonResponse : public ::c10d::control_plane::Response {
 public:
+// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
 void setContent(std::string&& content, const std::string& content_type)
 override {
 PYBIND11_OVERRIDE_PURE_NAME(
@@ -1610,7 +1612,7 @@ Arguments:
 if (!store) {
 throw py::value_error("store argument cannot be None");
 }
-return new ::c10d::PrefixStore(prefix, store);
+return new ::c10d::PrefixStore(prefix, std::move(store));
 }),
 py::arg("prefix"),
 py::arg("store"))
@@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
 .def(
 "_verify_work_timeout",
 [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-const c10::intrusive_ptr<::c10d::Work> work,
+const c10::intrusive_ptr<::c10d::Work>& work,
 const std::chrono::milliseconds& timeout) {
 return self->verifyWorkTimeoutForTest(work, timeout);
 },
@@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT
 {"_c10d_init", c10d_init, METH_NOARGS, nullptr},
 {nullptr, nullptr, 0, nullptr}};
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 PyMethodDef* python_functions() {
 return methods;
 }

View File

@@ -7,6 +7,7 @@
 namespace c10d::intra_node_comm {
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 bool isIntraNodeCommSupported();
 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
@@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() {
 * Use c10d::Store to perform allgather on a trivially copyable type.
 */
 template <typename T>
-std::vector<T> storeAllGather(
+static std::vector<T> storeAllGather(
 const c10::intrusive_ptr<c10d::Store>& store,
 const std::string& prefix,
 size_t rank,
@@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() {
 return false;
 }
+// NOLINTNEXTLINE(bugprone-signed-char-misuse)
 deviceIdx_ = at::cuda::current_device();
 // Exchange hostname and device bus ID
 struct DevInfo {
+// NOLINTNEXTLINE
 char hostname[HOST_NAME_MAX + 1];
 int deviceIdx;
 };
@@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() {
 }
 auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
-set_group_info(groupName, rank_, worldSize_, store_);
+set_group_info(
+groupName, static_cast<int>(rank_), static_cast<int>(worldSize_), store_);
 auto allocator = get_allocator(c10::DeviceType::CUDA);
 symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
 symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);

View File

@@ -6,7 +6,6 @@
 #pragma once
 #include <ATen/ATen.h>
-#include <vector>
 namespace torch::distributed::c10d::quantization {

View File

@@ -6,7 +6,6 @@
 #pragma once
 #include <ATen/ATen.h>
-#include <vector>
 namespace torch::distributed::c10d::quantization {

View File

@@ -7,11 +7,11 @@
 #include <vector>
 namespace c10d {
-const int kUnsetSeqNum = 0;
+constexpr int kUnsetSeqNum = 0;
 namespace {
 constexpr int kByteOffset = 8;
-}
+} // namespace
 // Converts from int to char vec to write in store
 template <typename T>