[Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903
Approved by: https://github.com/ezyang
@@ -4,8 +4,7 @@
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 using HandleType = CUmemGenericAllocationHandle;
@@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target {
       size_t block_size,
       size_t buffer_size,
       size_t signal_pad_offset,
-      const std::string& group_name)
+      std::string group_name)
       : handle(handle),
         device_idx(device_idx),
         block_size(block_size),
         buffer_size(buffer_size),
         signal_pad_offset(signal_pad_offset),
-        group_name(group_name),
+        group_name(std::move(group_name)),
         symm_mem(nullptr) {}
 };
 
@@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator {
   std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
 };
 
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
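Both symmetric-memory hunks collapse the two-level namespace into the C++17 nested-namespace form suggested by clang-tidy's modernize-concat-nested-namespaces check. A minimal sketch of the two equivalent spellings (the struct names here are hypothetical):

// Pre-C++17 spelling:
namespace c10d {
namespace symmetric_memory {
struct ExampleA {};
} // namespace symmetric_memory
} // namespace c10d

// C++17 spelling, as used by the patch; declares into the same namespace:
namespace c10d::symmetric_memory {
struct ExampleB {};
} // namespace c10d::symmetric_memory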
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <optional>
-
 #include <ATen/ATen.h>
 
 namespace c10d {
@@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target {
 
 struct DMAConnectivityDetector : c10::intrusive_ptr_target {
   virtual c10::intrusive_ptr<DMAConnectivity> detect() = 0;
-  virtual ~DMAConnectivityDetector() {}
+  ~DMAConnectivityDetector() override = default;
 };
 
 C10_EXPORT void register_dma_connectivity_detector(
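This destructor rewrite recurs throughout the patch: once the base class declares a virtual destructor, a derived class can replace `virtual ~Derived() {}` with `~Derived() override = default`, which says the same thing more precisely and lets the compiler generate a trivial destructor. A sketch with hypothetical types:

struct Base {
  virtual ~Base() = default;  // one virtual destructor at the root
};

struct Derived : Base {
  ~Derived() override = default;  // same behavior as `virtual ~Derived() {}`
};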
@@ -11,6 +11,7 @@ class GroupRegistry {
  public:
   void register_group(
       std::string group_name,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       c10::intrusive_ptr<c10d::ProcessGroup> group) {
     std::unique_lock write_lock(lock_);
     auto [_, inserted] =
@@ -370,7 +370,7 @@ class NCCLComm {
   NCCLComm& operator=(NCCLComm&& other) = delete;
 
   // Move constructable
-  // NOLINTNEXTLINE(.*-noexcept-move-.*)
+  // NOLINTNEXTLINE(*-noexcept-move-*)
   NCCLComm(NCCLComm&& other) {
     // Using other's lock, as it reads other's states
     // Can not use this.mutex_, as this object is being constructed.
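The fix here is to the suppression comment itself: clang-tidy matches the check names inside a NOLINT comment as globs with `*` wildcards, not as regular expressions, so the old `.*-noexcept-move-.*` spelling never matched any check. A sketch of the working form (the type is hypothetical):

struct Movable {
  // NOLINTNEXTLINE(*-noexcept-move-*)
  Movable(Movable&&) {}  // move constructor deliberately not noexcept
};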
@@ -6,6 +6,7 @@
 #include <deque>
 #include <mutex>
 #include <thread>
+#include <utility>
 #include <vector>
 
 #include <gloo/algorithm.h>
@@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend {
   // Wrap c10d store as Gloo store
   class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
-    GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
+    GlooStore(c10::intrusive_ptr<::c10d::Store> store)
+        : store_(std::move(store)) {}
 
     void setUint(const std::string& key, const std::vector<uint8_t>& value) {
       store_->set(key, value);
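Like the Block constructor earlier, GlooStore now takes its store by value and moves it into the member: the pass-by-value-and-move (sink parameter) idiom. Rvalue callers pay a move instead of a refcount bump or copy, and lvalue callers pay exactly one copy. A minimal sketch with a hypothetical class:

#include <string>
#include <utility>

class Holder {
 public:
  // Sink parameter: taken by value, then moved into the member.
  explicit Holder(std::string name) : name_(std::move(name)) {}

 private:
  std::string name_;
};

int main() {
  std::string n = "group0";
  Holder a(n);             // one copy into the parameter, then a move
  Holder b(std::move(n));  // no copy at all: move, then move
}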
@@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) {
 
 bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
   switch (reduceOp) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
     case ReduceOp::SUM:
       return true;
     case ReduceOp::AVG:
@@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum(
       &preMulSum,
       // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
       // tells us that the scalar input is strictly a multiplier.
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
       dataType,
       residence,
@@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->registerSegment(reinterpret_cast<void*>(te.addr_), te.size_);
     }
   }
@@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook(
     auto& ncclComm = it.first;
     auto& devIdx = it.second;
     if (te.device_ == devIdx) {
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       ncclComm->deregisterSegment(reinterpret_cast<void*>(te.addr_));
     }
   }
@@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     : Backend(rank, size),
       store_(std::move(store)),
       options_(std::move(options)),
-
       traceKeyStart_(getTraceStartKey("NCCL", rank)),
       traceKeyEnd_(getTraceEndKey("NCCL", rank)),
       terminateProcessGroup_(false),
@@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   // other threads and cause segfaults.
   const auto ncclVersion = getNcclVersion();
   this->setGroupUid(options_->group_name);
-  this->localDeviceCount_ = at::cuda::getNumGPUs();
+  this->localDeviceCount_ = static_cast<int>(at::cuda::getNumGPUs());
   logPrefix_ = createLogPrefix();
   blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false);
   asyncErrorHandling_ = static_cast<ErrorHandlingMode>(
@@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     this->globalRankStride = 0;
   } else {
     bool ranksAreStrided = true;
-    int startRank = options_->global_ranks_in_group[0];
-    int stride =
+    auto startRank = options_->global_ranks_in_group[0];
+    auto stride =
         options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0];
     for (std::vector<uint64_t>::size_type i = 0;
          i < options_->global_ranks_in_group.size();
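The `int` declarations silently narrowed the `uint64_t` entries of `global_ranks_in_group` (the loop's `std::vector<uint64_t>::size_type` gives the element type away); `auto` keeps the deduced 64-bit unsigned type. A small self-contained sketch (the vector contents are made up):

#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t> ranks = {8, 10, 12};
  auto startRank = ranks[0];          // deduced uint64_t, no narrowing
  auto stride = ranks[1] - ranks[0];  // arithmetic stays in uint64_t
  return static_cast<int>(startRank + stride) == 10 ? 0 : 1;
}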
@@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() {
   this->abort();
 }
 
+// NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";
 
@@ -198,7 +198,8 @@ struct DumpPipe {
     if (fd_ == -1) {
       return false;
     }
-    char buf[128];
+    // NOLINTNEXTLINE(*array*)
+    char buf[128]{};
     // non-blocking from O_NONBLOCK above.
     // Ignore EINTR because we already will poll this
     // again later.
@@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // NOTE: We intentionally store raw pointers so that
   // we do not attempt to destroy the event objects on process exit,
   // because cuda may be gone.
-  std::deque<at::cuda::CUDAEvent*>
-      eventsArray_[2]; // 0 for timing=false, 1 for timing=true
+  std::array<std::deque<at::cuda::CUDAEvent*>, 2>
+      eventsArray_; // 0 for timing=false, 1 for timing=true
 };
 
 struct Options : Backend::Options {
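Replacing the C array with std::array keeps the same storage layout but satisfies the avoid-c-arrays checks, and the member gains `.size()`, iterators, and bounds-checked `.at()`. A sketch with a hypothetical element type:

#include <array>
#include <deque>

std::deque<int*> oldStyle[2];              // C array of deques (flagged)
std::array<std::deque<int*>, 2> newStyle;  // equivalent std::array form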
@@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool dumpOnTimeoutOrEx_;
 
   // Whether or not to sleep after an exception is thrown in the watchdog.
-  bool sleepAfterException_;
+  bool sleepAfterException_{};
 
   // Whether or not to enable nan check for input tensors to collectives.
   bool enableNanCheck_;
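The `{}` default member initializer value-initializes the flag to `false` at its declaration, so every constructor starts from a defined value even if it never mentions the field. A sketch with a hypothetical struct:

struct Flags {
  bool sleepAfterException{};  // always starts false
  int retries{};               // always starts 0
};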
@@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook {
   PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {}
   PythonOnCompletionHook(const PythonOnCompletionHook&) = default;
 
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~PythonOnCompletionHook() {
     py::gil_scoped_acquire ag;
     hook_.dec_ref();
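bugprone-exception-escape shows up several times in this patch, always on destructors: destructors are implicitly noexcept, so an exception thrown inside one (for example while reacquiring the GIL) goes straight to std::terminate, and the NOLINT records that the code knowingly accepts that risk. A hypothetical sketch:

struct GilHook {
  // NOLINTNEXTLINE(bugprone-exception-escape)
  ~GilHook() {
    // Release interpreter-side resources here; in pathological cases this
    // work could throw, which a noexcept destructor turns into terminate.
  }
};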
@@ -55,7 +55,7 @@ class RankLocal {
   }
 
  private:
-  RankLocal(){};
+  RankLocal() = default;
   thread_local static T* cached_;
   static std::unordered_map<uint64_t, T> thread_id_to_rank_local_;
   static std::shared_mutex lock_;
@@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
   // watchKey() is deprecated and no longer supported.
   virtual void watchKey(
       const std::string& /* unused */,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
       WatchKeyCallback /* unused */) {
     TORCH_CHECK(false, "watchKey is deprecated, no implementation support it.");
   }
@@ -3,8 +3,7 @@
 #include <ATen/ATen.h>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 
-namespace c10d {
-namespace symmetric_memory {
+namespace c10d::symmetric_memory {
 
 // SymmetricMemory represents symmetric allocations across a group of devices.
 // The allocations represented by a SymmetricMemory object are accessible by
@@ -38,7 +37,7 @@ namespace symmetric_memory {
 // for these two barriers, they can operate correctly in parallel.
 class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemory() {}
+  ~SymmetricMemory() override = default;
 
   virtual std::vector<void*> get_buffer_ptrs() = 0;
   virtual std::vector<void*> get_signal_pad_ptrs() = 0;
@@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
 
 class SymmetricMemoryAllocator : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymmetricMemoryAllocator(){};
+  ~SymmetricMemoryAllocator() override = default;
 
   virtual void* alloc(
       size_t size,
@@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
 TORCH_API bool has_multicast_support(
     c10::DeviceType device_type,
     int device_idx);
-} // namespace symmetric_memory
-} // namespace c10d
+} // namespace c10d::symmetric_memory
@@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor {
   explicit IntrusivePtrNoGilDestructor(T* impl)
       // NOLINTNEXTLINE(bugprone-exception-escape)
       : impl_(c10::intrusive_ptr<T>::unsafe_steal_from_new(impl)) {}
+  // NOLINTNEXTLINE(bugprone-exception-escape)
   ~IntrusivePtrNoGilDestructor() {
     if (impl_) {
       if (PyGILState_Check()) {
@@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request {
 };
 class PythonResponse : public ::c10d::control_plane::Response {
  public:
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
   void setContent(std::string&& content, const std::string& content_type)
       override {
     PYBIND11_OVERRIDE_PURE_NAME(
@@ -1610,7 +1612,7 @@ Arguments:
             if (!store) {
               throw py::value_error("store argument cannot be None");
             }
-            return new ::c10d::PrefixStore(prefix, store);
+            return new ::c10d::PrefixStore(prefix, std::move(store));
           }),
           py::arg("prefix"),
           py::arg("store"))
@@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def(
           "_verify_work_timeout",
           [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-             const c10::intrusive_ptr<::c10d::Work> work,
+             const c10::intrusive_ptr<::c10d::Work>& work,
              const std::chrono::milliseconds& timeout) {
             return self->verifyWorkTimeoutForTest(work, timeout);
           },
@@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT
     {"_c10d_init", c10d_init, METH_NOARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 PyMethodDef* python_functions() {
   return methods;
 }
@@ -7,6 +7,7 @@
 
 namespace c10d::intra_node_comm {
 
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 bool isIntraNodeCommSupported();
 
 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
@@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() {
  * Use c10d::Store to perform allgather on a trivially copyable type.
  */
 template <typename T>
-std::vector<T> storeAllGather(
+static std::vector<T> storeAllGather(
     const c10::intrusive_ptr<c10d::Store>& store,
     const std::string& prefix,
     size_t rank,
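Marking the file-local helper `static` gives it internal linkage, which is what misc-use-internal-linkage asks for on symbols used only within one translation unit; an anonymous namespace achieves the same. A sketch with a hypothetical helper:

// Internal linkage via `static`: no symbol is exported from this
// translation unit, so the linker and the linter are both satisfied.
static int localHelper(int x) {
  return x * 2;
}

// Equivalent alternative: an anonymous namespace.
namespace {
int localHelper2(int x) {
  return x + 1;
}
} // namespace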
@@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() {
     return false;
   }
 
+  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   deviceIdx_ = at::cuda::current_device();
 
   // Exchange hostname and device bus ID
   struct DevInfo {
+    // NOLINTNEXTLINE
     char hostname[HOST_NAME_MAX + 1];
     int deviceIdx;
   };
@@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() {
   }
 
   auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
-  set_group_info(groupName, rank_, worldSize_, store_);
+  set_group_info(
+      groupName, static_cast<int>(rank_), static_cast<int>(worldSize_), store_);
   auto allocator = get_allocator(c10::DeviceType::CUDA);
   symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
   symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);
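`rank_` and `worldSize_` are wider than the `int` parameters of set_group_info, so the conversion is now spelled out with static_cast rather than left implicit and warning-prone. A self-contained sketch (the function and types here are stand-ins, not the real c10d API):

#include <cstddef>

// Stand-in for a callee that takes narrower integer parameters.
void takeRank(int /*rank*/, int /*worldSize*/) {}

void example(std::size_t rank, std::size_t worldSize) {
  // Explicit narrowing: the cast documents that the truncation is intended.
  takeRank(static_cast<int>(rank), static_cast<int>(worldSize));
}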
@@ -6,7 +6,6 @@
 #pragma once
 
 #include <ATen/ATen.h>
-#include <vector>
 
 namespace torch::distributed::c10d::quantization {
 
@@ -6,7 +6,6 @@
 #pragma once
 
 #include <ATen/ATen.h>
-#include <vector>
 
 namespace torch::distributed::c10d::quantization {
 
@@ -7,11 +7,11 @@
 #include <vector>
 
 namespace c10d {
-const int kUnsetSeqNum = 0;
+constexpr int kUnsetSeqNum = 0;
 
 namespace {
 constexpr int kByteOffset = 8;
-}
+} // namespace
 
 // Converts from int to char vec to write in store
 template <typename T>
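`constexpr` makes kUnsetSeqNum a guaranteed compile-time constant usable wherever a constant expression is required, and the `// namespace` comment on the closing brace is the usual readability convention for namespace ends. A tiny sketch using the two constants from the hunk:

constexpr int kUnsetSeqNum = 0;
constexpr int kByteOffset = 8;

// Both are usable in constant-expression contexts, e.g. array bounds
// or static_assert:
static_assert(kByteOffset - kUnsetSeqNum == 8, "compile-time arithmetic");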