diff --git a/.clang-tidy b/.clang-tidy
index 4cf4d5ab303b..3c96106fe53d 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -59,9 +59,9 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include
+readability-duplicate-include,
 readability-misplaced-array-index,
-readability-redundant*
+readability-redundant*,
 readability-simplify-subscript-expr,
 readability-string-compare,
 -readability-redundant-access-specifiers,
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index 575dc9315805..d43e189111c2 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
       && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
       // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
-      || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
+      || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
     TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
     at::cuda::detail::f8f8bf16_rowwise(
         mat1,
diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
index 02dcf4e7675c..555548547437 100644
--- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
+++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index 152aa324002f..f1753c1b52d2 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -1041,8 +1041,8 @@ std::string generate_code(
   // and `extra_args` for computation call if
   // extra arguments to capture runtime state are passed.
   // (look at polygamma for example).
- std::string extra_params = ""; - std::string extra_args = ""; + std::string extra_params; + std::string extra_args; for (size_t i = 0; i < extra_args_typenames.size(); i++) { auto type = std::string(extra_args_typenames[i]); auto name = "extra_arg_" + std::to_string(i); @@ -1352,7 +1352,7 @@ std::string generate_reduction_code( int vec_size, int max_threads_codegen) { TORCH_INTERNAL_ASSERT(desc.nInputs == 1); - TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0); + TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty()); return generate_reduction_code( desc.nOutputs, @@ -1451,7 +1451,7 @@ std::optional get_cache_dir() { std::string cache_dir; char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH"); // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user - std::string kernels_cache_dir = ""; + std::string kernels_cache_dir; if (ptkcp != nullptr) { cache_dir = std::string(ptkcp); } else { diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index d52264026185..813c7fbdcc92 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 6b13c41978e9..1362502a5d28 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -813,7 +813,7 @@ static void _save_variables( const std::vector>& tensors_to_save, const std::shared_ptr& cdata_ptr, THPFunction* self) { - if (tensors_to_save.size() == 0) + if (tensors_to_save.empty()) return; size_t num_saved = tensors_to_save.size(); self->saved_variables.clear(); diff --git a/torch/csrc/distributed/c10d/Functional.cpp b/torch/csrc/distributed/c10d/Functional.cpp index 99b0c7d17bf7..16530f0e6502 100644 --- a/torch/csrc/distributed/c10d/Functional.cpp +++ b/torch/csrc/distributed/c10d/Functional.cpp @@ -35,7 +35,7 @@ at::Tensor allocate_all_gather_output( int64_t group_size) { TORCH_CHECK(input.is_contiguous()); auto output_size = input.sizes().vec(); - if (output_size.size() == 0) { + if (output_size.empty()) { output_size.push_back(group_size); } else { output_size[0] *= group_size; diff --git a/torch/csrc/distributed/c10d/GlooDeviceFactory.cpp b/torch/csrc/distributed/c10d/GlooDeviceFactory.cpp index 7bd7115ba8cf..d9a74e2efa37 100644 --- a/torch/csrc/distributed/c10d/GlooDeviceFactory.cpp +++ b/torch/csrc/distributed/c10d/GlooDeviceFactory.cpp @@ -196,7 +196,7 @@ std::shared_ptr<::gloo::transport::Device> makeGlooDevice( static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT"); if (transportName.has_value()) { return GlooDeviceRegistry()->Create( - transportName.value().c_str(), interfaceName, hostName, lazyInit); + transportName.value(), interfaceName, hostName, lazyInit); } #ifdef __linux__ diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index e57e2c2a8d41..9f79a09d236e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -165,7 +165,7 @@ c10::intrusive_ptr ProcessGroup::splitGroup( const std::optional& name, const std::optional& desc) { TORCH_CHECK( - ranks.size() > 0, + !ranks.empty(), "Split ranks cannot be empty. 
Please provide a non-empty list of ranks to split the group."); TORCH_CHECK( ranks.size() <= static_cast(size_), diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index 8ab924923fe9..5dd62cd38cdd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -559,7 +559,7 @@ c10::intrusive_ptr ProcessGroupGloo::Options:: // Use interfaces listed in "GLOO_SOCKET_IFNAME", if set. auto ifnameEnv = c10::utils::get_env("GLOO_SOCKET_IFNAME"); if (ifnameEnv && ifnameEnv->size() > 1) { - for (const auto& iface : ::c10d::split(',', ifnameEnv->c_str())) { + for (const auto& iface : ::c10d::split(',', *ifnameEnv)) { options->devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForInterface(iface, lazyInit)); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp b/torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp index 6e680b41fe8d..7af255f80e38 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp @@ -1,6 +1,7 @@ #ifdef USE_C10D_GLOO #include #include +#include #include @@ -24,7 +25,7 @@ class AsyncAllreduceCUDADeviceWork : public ProcessGroupGloo::AsyncWork { "gloo:all_reduce", inputs), inputs_(inputs), - reduceOp_(reduceOp) {} + reduceOp_(std::move(reduceOp)) {} template void createAlgorithm(std::unique_ptr& algo) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index e3ac4c09a9b0..3386d8ee0a66 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -1089,8 +1089,8 @@ class TORCH_API ProcessGroupNCCL : public Backend { bool useNonblocking(); protected: - int globalRankStart_; - int globalRankStride_; + int globalRankStart_{}; + int globalRankStride_{}; private: bool eagerInit_{false}; @@ -1380,7 +1380,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::shared_ptr coalescedComm_ = nullptr; // Whether the coalesced calls are sync or async. - bool coalescedAsync_; + bool coalescedAsync_{}; // keeps track of input and output tensors when coalescing is in flight. Will // hand over these tensors to WorkNCCL's stash when coalescing is ended. 
diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
index a0d2738ab692..624a8fc11b61 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
@@ -163,8 +163,8 @@ struct CollectiveFingerPrint {
     backend->allgather(output_tensors, tensors_to_verify)->wait();
     // Verify equivalence
     for (const auto i : c10::irange(output_tensors.size())) {
-      const std::vector gathered_tensors = output_tensors[i];
-      const at::Tensor reference_tensor = tensors_to_verify[i];
+      const std::vector& gathered_tensors = output_tensors[i];
+      const at::Tensor& reference_tensor = tensors_to_verify[i];
       for (const auto rank : c10::irange(gathered_tensors.size())) {
         const auto& rank_tensor = gathered_tensors[rank];
         if (!rank_tensor.equal(reference_tensor)) {
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index c25e83c07c6d..6f54a9429210 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -263,10 +263,12 @@ constexpr static int kNumUvThreads = 16;
 
 std::unique_ptr makeMultiplexedUvChannel() {
   std::vector> contexts;
+  contexts.reserve(kNumUvThreads);
   std::vector> listeners;
+  listeners.reserve(kNumUvThreads);
   for ([[maybe_unused]] const auto laneIdx : c10::irange(kNumUvThreads)) {
     auto context = tensorpipe::transport::uv::create();
-    std::string address = TensorPipeAgent::guessAddress();
+    const std::string& address = TensorPipeAgent::guessAddress();
     contexts.push_back(std::move(context));
     listeners.push_back(contexts.back()->listen(address));
   }
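
Note (not part of the patch): the hunks above are mechanical clang-tidy cleanups. As a rough guide to the patterns being applied, the standalone sketch below uses hypothetical names, not PyTorch code, and shows the same transformations already in their "fixed" form: empty() instead of size() == 0, no redundant "" initializer for std::string, value-initialized scalar members, std::move of a by-value constructor parameter, and reserve() before a push_back loop. The check names mentioned in the comments are inferred from the edits; the patch itself only shows the .clang-tidy entries being adjusted.

// Standalone sketch, not part of the PyTorch sources touched above.
#include <string>
#include <utility>
#include <vector>

struct Worker {
  // A default-constructed std::string is already empty, so an explicit
  // "" initializer is redundant (readability-redundant-string-init,
  // covered by the readability-redundant* entry above).
  std::string name_;
  // Value-initialize scalar members so they never start out indeterminate,
  // mirroring globalRankStart_{} / coalescedAsync_{} in the diff.
  int rank_{};
  bool async_{};

  // Take the string by value and move it into the member instead of copying,
  // mirroring reduceOp_(std::move(reduceOp)) in the diff.
  explicit Worker(std::string name) : name_(std::move(name)) {}
};

std::vector<Worker> makeWorkers(const std::vector<std::string>& names) {
  std::vector<Worker> workers;
  // Reserve capacity up front before an emplace_back loop, mirroring
  // contexts.reserve(kNumUvThreads) in tensorpipe_agent.cpp.
  workers.reserve(names.size());
  for (const std::string& name : names) {
    // Prefer empty() over size() == 0 (readability-container-size-empty).
    if (name.empty()) {
      continue;
    }
    workers.emplace_back(name);
  }
  return workers;
}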