Compare commits

...

3 Commits

Author   SHA1        Message                                   Date
cyy      67e094992e  Revert changes                            2025-09-21 09:20:22 +00:00
cyy      14622ff376  More fixes                                2025-09-21 09:20:22 +00:00
cyy      35d5ec3d85  Fix clang-tidy warnings of performance    2025-09-21 09:20:22 +00:00
41 changed files with 114 additions and 103 deletions
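
For context (this note and the sketch below are not part of the original compare view): most of the hunks that follow appear to address clang-tidy's performance checks, chiefly performance-unnecessary-value-param (take read-only, expensive-to-copy arguments by const reference) and performance-move-const-arg (drop std::move where a move is just a copy). A minimal, hypothetical illustration of both patterns:

    #include <string>
    #include <utility>
    #include <vector>

    struct Registry {
      // Flagged by performance-unnecessary-value-param: the argument is only read,
      // so copying it at every call site is wasted work.
      //   void add(std::string name) { names_.push_back(name); }

      // Preferred: bind to a const reference; the single necessary copy happens inside.
      void add(const std::string& name) { names_.push_back(name); }

      std::vector<std::string> names_;
    };

    // performance-move-const-arg: std::move on a trivially copyable type
    // (int, enums, DispatchKeySet, std::type_index, ...) degenerates to a copy,
    // so the cast only obscures the code.
    int identity(int x) { return x; }  // returning std::move(x) here would change nothing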

View File

@ -94,10 +94,10 @@ inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) {
inline at::SymDimVector infer_size_dv(
c10::SymIntArrayRef shape,
c10::SymInt numel) {
const c10::SymInt& numel) {
auto res = at::SymDimVector(shape);
infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
shape, std::move(numel), res);
shape, numel, res);
return res;
}

View File

@ -6,7 +6,6 @@
#include <c10/util/TypeList.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/order_preserving_flat_hash_map.h>
#include <optional>
#include <ATen/core/TensorBody.h>
#include <ATen/core/jit_type_base.h>

View File

@ -55,8 +55,7 @@ class TORCH_API CppSignature final {
}
private:
explicit CppSignature(std::type_index signature)
: signature_(std::move(signature)) {}
explicit CppSignature(std::type_index signature) : signature_(signature) {}
std::type_index signature_;
};

View File

@ -70,7 +70,7 @@ private:
void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
auto nesting_value = dispatch_trace_nesting_value();
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << '\n';
}
} // namespace detail
@ -213,9 +213,11 @@ OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
// Windows build doesn't produce the destructor symbol in PyTorch libs
// causing a linker failure in downstream projects.
// x-ref https://github.com/pytorch/pytorch/issues/70032
#if defined(_WIN32)
OperatorHandle::~OperatorHandle() = default;
#endif
RegistrationHandleRAII Dispatcher::registerLibrary(std::string ns, std::string debug) {
RegistrationHandleRAII Dispatcher::registerLibrary(const std::string& ns, std::string debug) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto found = libraries_.find(ns);
TORCH_CHECK(
@ -306,7 +308,7 @@ PythonModuleMapType& pythonModulesSingleton() {
}
std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(OperatorName op_name) {
std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(const OperatorName& op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto found = pythonModulesSingleton().find(op_name);
if (found == pythonModulesSingleton().end()) {
@ -342,7 +344,7 @@ RegistrationHandleRAII Dispatcher::registerPythonModule(
});
}
void Dispatcher::throwIfHasPythonModule(OperatorName op_name) {
void Dispatcher::throwIfHasPythonModule(const OperatorName& op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto elt = pythonModulesSingleton().find(op_name);
if (elt == pythonModulesSingleton().end()) {
@ -362,7 +364,7 @@ void Dispatcher::throwIfHasPythonModule(OperatorName op_name) {
}
RegistrationHandleRAII Dispatcher::registerImpl(
OperatorName op_name,
const OperatorName& op_name,
std::optional<DispatchKey> dispatch_key,
KernelFunction kernel,
std::optional<impl::CppSignature> cpp_signature,
@ -377,7 +379,7 @@ RegistrationHandleRAII Dispatcher::registerImpl(
*this,
dispatch_key,
std::move(kernel),
std::move(cpp_signature),
cpp_signature,
std::move(inferred_function_schema),
std::move(debug)
);
@ -406,7 +408,7 @@ void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& o
cleanup(op, op_name);
}
RegistrationHandleRAII Dispatcher::registerName(OperatorName op_name) {
RegistrationHandleRAII Dispatcher::registerName(const OperatorName& op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto op = findOrRegisterName_(op_name);
++op.operatorDef_->def_and_impl_count;
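
Not part of the diff: the std::endl to '\n' replacement above matches clang-tidy's performance-avoid-endl check; std::endl writes a newline and then requests a flush, whereas '\n' leaves flushing to the stream. A minimal sketch:

    #include <iostream>
    #include <string>

    void log_line(const std::string& msg) {
      std::cerr << msg << '\n';           // same output, no explicit flush requested
      // std::cerr << msg << std::endl;   // flagged: '\n' plus a forced flush
    }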

View File

@ -13,15 +13,10 @@
#include <condition_variable>
#include <list>
#include <mutex>
#include <type_traits>
#include <ATen/core/enum_tag.h>
#include <ATen/core/grad_mode.h>
#ifndef NDEBUG
#include <iostream>
#endif
namespace c10 {
TORCH_API bool show_dispatch_trace();
@ -255,7 +250,7 @@ class TORCH_API Dispatcher final {
// NB: steals the inferred function schema, as we may need to hold on to
// it for a bit until the real schema turns up
RegistrationHandleRAII registerImpl(
OperatorName op_name,
const OperatorName& op_name,
std::optional<DispatchKey> dispatch_key,
KernelFunction kernel,
std::optional<impl::CppSignature> cpp_signature,
@ -274,15 +269,15 @@ class TORCH_API Dispatcher final {
/**
* Given an operator, throws if we have a pystub.
*/
void throwIfHasPythonModule(OperatorName op_name);
void throwIfHasPythonModule(const OperatorName& op_name);
std::optional<std::pair<const char*, const char*>> getPyStub(
OperatorName op_name);
const OperatorName& op_name);
/**
* Register a new operator by name.
*/
RegistrationHandleRAII registerName(OperatorName op_name);
RegistrationHandleRAII registerName(const OperatorName& op_name);
/**
* Register a fallback kernel for a backend.
@ -300,7 +295,9 @@ class TORCH_API Dispatcher final {
* API. These invocations are only permitted once per program, so we raise
* an error if this is called again for the same namespace.
*/
RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
RegistrationHandleRAII registerLibrary(
const std::string& ns,
std::string debug);
// ------------------------------------------------------------------------
//
@ -448,8 +445,12 @@ class TORCH_API OperatorHandle {
OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
OperatorHandle(const OperatorHandle&) = default;
OperatorHandle& operator=(const OperatorHandle&) = default;
#if defined(_WIN32)
// NOLINTNEXTLINE(performance-trivially-destructible)
~OperatorHandle();
#else
~OperatorHandle() = default;
#endif
const OperatorName& operator_name() const {
return operatorDef_->op.operator_name();

View File

@ -556,7 +556,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
// real_type versus fake_type: in order to be compatible with FunctionSchema
// parser, printing an argument with either MemoryFormat or Layout type should
// give us the original schema string, hence printing out real_type.
auto type = arg.real_type();
const auto& type = arg.real_type();
bool is_opt = type->kind() == OptionalType::Kind;
auto unopt_type = is_opt ? type->castRaw<OptionalType>()->getElementType() : type;

View File

@ -232,7 +232,7 @@ struct TORCH_API OptionalType : public UnionType {
static TypePtr ofTensor();
//
// global singleton
static TypePtr get(TypePtr inner);
static TypePtr get(const TypePtr& inner);
private:
explicit OptionalType(const TypePtr& contained);
@ -895,7 +895,7 @@ struct TORCH_API ListType
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, TypePtr inner);
static TypePtr get(const std::string& identifier, const TypePtr& inner);
// common cast List[Tensor]
static ListTypePtr ofTensors();

View File

@ -274,7 +274,7 @@ ListTypePtr ListType::ofNumbers() {
return value;
}
TypePtr OptionalType::get(TypePtr inner) {
TypePtr OptionalType::get(const TypePtr& inner) {
static ska::flat_hash_map<TypePtr, TypePtr> containerTypePtrs;
static std::mutex mutex;
// Perf from the lock is ok because this function is guarded behind
@ -287,7 +287,7 @@ TypePtr OptionalType::get(TypePtr inner) {
return containerTypePtrs[inner];
}
TypePtr ListType::get(const std::string& identifier, TypePtr inner) {
TypePtr ListType::get(const std::string& identifier, const TypePtr& inner) {
static ska::flat_hash_map<std::tuple<std::string, TypePtr>, TypePtr> containerTypePtrs;
static std::mutex mutex;
// Perf from the lock is ok because this function is guarded behind

View File

@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread
// Called by the destructor. Releases this thread's handles back into the pool.
void release() {
if(my_handles.size() > 0) {
if(!my_handles.empty()) {
auto parent = weak_parent.lock();
if (!parent) {
// If this thread exits after atexit handlers have completed, the

View File

@ -139,7 +139,7 @@ static void autogradBasedTransformSendToNext(
std::bitset<default_bitset_size> outputs_aliasing_immutable; // set = 1 for all bits
if(!grad_special_case) {
for (auto idx = stack->size() - args_size; idx < stack->size(); idx++) {
const auto ivalue = (*stack)[idx];
const auto& ivalue = (*stack)[idx];
if (!ivalue.isTensor()) {
continue; // only input that can be aliased is a tensor, not a tensor list (expect in ops without returns)
}

View File

@ -6,6 +6,8 @@
#include <ATen/functorch/BatchRulesHelper.h>
#include <algorithm>
namespace at::functorch {
typedef std::tuple<Tensor, std::optional<int64_t>> oneOutput;
@ -315,7 +317,7 @@ oneOutput linalg_lu_solve_batch_rule(
const auto LU_num_batch_dims = rankWithoutBatchDim(LU_, LU_bdim) - LU_min_rank;
const auto pivots_num_batch_dims = rankWithoutBatchDim(pivots_, pivots_bdim) - pivots_min_rank;
const auto B_num_batch_dims = rankWithoutBatchDim(B_, B_bdim) - B_min_rank;
const auto max_num_batch_dims = std::max(std::max(LU_num_batch_dims, pivots_num_batch_dims), B_num_batch_dims);
const auto max_num_batch_dims = std::max({LU_num_batch_dims, pivots_num_batch_dims, B_num_batch_dims});
LU_ = maybePadToLogicalRank(LU_, LU_bdim, max_num_batch_dims + LU_min_rank);
pivots_ = maybePadToLogicalRank(pivots_, pivots_bdim, max_num_batch_dims + pivots_min_rank);
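
Not part of the diff: nested std::max calls work, but the initializer-list overload used in the hunk above expresses a three-way maximum directly. Illustrative only:

    #include <algorithm>
    #include <cstdint>

    int64_t max_of_three(int64_t a, int64_t b, int64_t c) {
      // std::max(std::initializer_list<T>) returns the largest element,
      // replacing std::max(std::max(a, b), c).
      return std::max({a, b, c});
    }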

View File

@ -897,11 +897,11 @@ Tensor& div_(Tensor& self, const Scalar& other) {
}
Tensor div(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
return self.div(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
}
Tensor& div_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
return self.div_(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
}
// divide, alias for div
@ -926,23 +926,23 @@ Tensor& divide_(Tensor& self, const Scalar& other) {
}
Tensor& divide_out(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode, Tensor& result) {
return at::div_out(result, self, other, std::move(rounding_mode));
return at::div_out(result, self, other, rounding_mode);
}
Tensor divide(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
return self.div(other, std::move(rounding_mode));
return self.div(other, rounding_mode);
}
Tensor& divide_(Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
return self.div_(other, std::move(rounding_mode));
return self.div_(other, rounding_mode);
}
Tensor divide(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div(other, std::move(rounding_mode));
return self.div(other, rounding_mode);
}
Tensor& divide_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div_(other, std::move(rounding_mode));
return self.div_(other, rounding_mode);
}
// true_divide, an alias for div

View File

@ -150,7 +150,7 @@ void histogramdd_prepare_out(const Tensor& input, const std::vector<int64_t>& bi
void histogramdd_prepare_out(const Tensor& input, TensorList bins,
const Tensor& hist, const TensorList& bin_edges) {
std::vector<int64_t> bin_ct(bins.size());
std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](Tensor t) { return t.numel() - 1; });
std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](const Tensor& t) { return t.numel() - 1; });
histogramdd_prepare_out(input, bin_ct, hist, bin_edges);
}

View File

@ -360,7 +360,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
// to compute the number of dimensions covered by ellipsis.
for(const auto i : c10::irange(num_ops)) {
const auto& operand = operands[i];
const auto labels = op_labels[i];
const auto& labels = op_labels[i];
const auto ndims = operand.dim();
int64_t nlabels = static_cast<int64_t>(labels.size());
bool has_ellipsis = false;

View File

@ -237,7 +237,7 @@ TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord
at::detail::check_linalg_norm_dtype(opt_dtype, self.scalar_type(), "linalg.vector_norm");
auto mask = at::native::make_dim_mask(dim, self.dim());
auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim);
auto shape = at::native::shape_from_dim_mask(self, mask, keepdim);
auto options = self.options()
.dtype(toRealValueType(opt_dtype.value_or(self.scalar_type())));
@ -641,7 +641,7 @@ namespace {
Tensor linalg_matrix_power_impl(
const Tensor& self,
int64_t n,
std::optional<Tensor> _out) {
const std::optional<Tensor>& _out) {
NoTF32Guard disable_tf32;
auto out = _out.value_or(Tensor());
@ -1019,7 +1019,7 @@ Tensor multi_dot_impl(TensorList _tensors, std::optional<Tensor> _out) {
Tensor result;
if (_out.has_value()) {
auto out = *_out;
const auto& out = *_out;
TORCH_CHECK(
dtype == out.dtype(),
"multi_dot(): expected out tensor to have dtype ",

View File

@ -493,7 +493,7 @@ Tensor get_clamped_target_length(
// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients
// it also handles the reduction if desired
template <typename LengthsType>
Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, LengthsType input_lengths, LengthsType target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, const LengthsType& input_lengths, const LengthsType& target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
auto is_batched = log_probs_.dim() == 3;
Tensor log_probs = is_batched ? log_probs_ : log_probs_.unsqueeze(1);
bool use_cudnn =

View File

@ -599,7 +599,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
check_dims_match_num_input_features("weight", num_features, weight.sym_numel());
}
if (bias.defined()) {
check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel());
check_dims_match_num_input_features("bias", num_features, bias.sym_numel());
}
BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps);
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cpu(
std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_training(
const Tensor& self, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& bias_opt,
const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) {
return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*train=*/false, momentum, eps);
return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*training=*/false, momentum, eps);
}

View File

@ -1533,7 +1533,7 @@ std::tuple<Tensor, Tensor> lstm_cell(
check_rnn_cell_forward_input(input, w_ih.sym_size(1));
auto hidden_size = w_hh.sym_size(1);
check_rnn_cell_forward_hidden(input, hx[0], hidden_size, 0);
check_rnn_cell_forward_hidden(input, hx[1], std::move(hidden_size), 1);
check_rnn_cell_forward_hidden(input, hx[1], hidden_size, 1);
static at::Tensor undefined;
return LSTMCell<CellParams>{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh, undefined});
}
@ -1612,13 +1612,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_differentiable_gru_cell
h_g = h_g + hidden_bias;
}
auto chunked_input_gates = in_g.unsafe_chunk(3, 1);
Tensor ir = chunked_input_gates[0];
Tensor ii = chunked_input_gates[1];
Tensor in = chunked_input_gates[2];
const Tensor& ir = chunked_input_gates[0];
const Tensor& ii = chunked_input_gates[1];
const Tensor& in = chunked_input_gates[2];
auto chunked_hidden_gates = h_g.unsafe_chunk(3, 1);
Tensor hr = chunked_hidden_gates[0];
Tensor hi = chunked_hidden_gates[1];
Tensor hn = chunked_hidden_gates[2];
const Tensor& hr = chunked_hidden_gates[0];
const Tensor& hi = chunked_hidden_gates[1];
const Tensor& hn = chunked_hidden_gates[2];
Tensor rg = (ir + hr).sigmoid();
Tensor ig = (ii + hi).sigmoid();
Tensor grad_hx = grad_hy * ig;

View File

@ -409,17 +409,17 @@ static inline Tensor& unary_op_impl_out(Tensor& result, const Tensor& self, Stub
}
template <typename Stub, typename ...Args>
static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args... args) {
static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args&&... args) {
auto iter = TensorIterator::unary_float_op(result, self);
stub(iter.device_type(), iter, args...);
stub(iter.device_type(), iter, std::forward<Args>(args)...);
return result;
}
template <typename Stub, typename ...Args>
static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args... args) {
static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args&&... args) {
Tensor result;
auto iter = TensorIterator::unary_float_op(result, self);
stub(iter.device_type(), iter, args...);
stub(iter.device_type(), iter, std::forward<Args>(args)...);
return iter.output();
}
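
Not part of the diff: the Args... to Args&&... change above turns the helpers' trailing parameters into forwarding references, so arguments reach the stub without extra copies. A standalone, hypothetical sketch of the same pattern (the real DispatchStub machinery is not shown):

    #include <utility>

    struct FakeStub {
      template <typename... Ts>
      void operator()(Ts&&... /*ts*/) const {}  // stand-in for a dispatch stub
    };

    template <typename Stub, typename... Args>
    void call_stub(Stub& stub, Args&&... args) {
      // std::forward keeps lvalues as lvalues and rvalues as rvalues,
      // so movable temporaries are moved rather than copied downstream.
      stub(std::forward<Args>(args)...);
    }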

View File

@ -323,7 +323,7 @@ std::tuple<Tensor, Tensor, Tensor> unique_consecutive_cpu_template(
template<class ForwardIt>
ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last,
std::vector<int64_t>& indices, Tensor inverse_indices_vec, Tensor counts) {
std::vector<int64_t>& indices, const Tensor& inverse_indices_vec, const Tensor& counts) {
if (first == last) {
return last;
}

View File

@ -24,7 +24,7 @@ constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;
template <typename TENSOR_DTYPE, typename VEC_DTYPE>
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
std::vector<VEC_DTYPE> unwrap_vector(const at::Tensor& tensor) {
std::vector<VEC_DTYPE> vec(tensor.numel());
TENSOR_DTYPE* tensor_data_ptr = tensor.data_ptr<TENSOR_DTYPE>();
std::copy(tensor_data_ptr, tensor_data_ptr + tensor.numel(), vec.data());
@ -39,7 +39,7 @@ std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
*/
void unpack_bcsr(
int8_t* dst,
ao::sparse::BCSR bcsr,
const ao::sparse::BCSR& bcsr,
const int64_t R,
const int64_t C,
const int64_t RB,

View File

@ -35,7 +35,7 @@ C10_ALWAYS_INLINE void _check_rms_norm_inputs_symint(
std::stringstream ss;
ss << "Given normalized_shape=" << normalized_shape
<< ", expected input with shape [*";
for (auto size : normalized_shape) {
for (const auto& size : normalized_shape) {
ss << ", " << size;
}
ss << "], but got input of size" << input_shape;

View File

@ -77,7 +77,7 @@ static Tensor NestedTensor_elementwise_Tensor(
const Tensor& other,
const std::string& op_name,
bool supports_striding,
Func f) {
const Func& f) {
Tensor self_contiguous = self;
Tensor other_contiguous = other;
// self is a scalar
@ -238,7 +238,7 @@ static Tensor& NestedTensor_elementwise__Tensor(
Tensor& self,
const Tensor& other,
const std::string& op_name,
Func f) {
const Func& f) {
// self is a scalar
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
auto other_impl = get_nested_tensor_impl(other);

View File

@ -149,7 +149,7 @@ Tensor MakeStridedQTensorCPU(
const IntArrayRef& sizes,
const IntArrayRef& strides,
const TensorOptions& options,
QuantizerPtr quantizer) {
const QuantizerPtr& quantizer) {
AT_ASSERT(options.device().is_cpu());
at::native::check_size_nonnegative(sizes);
auto* allocator = at::getCPUAllocator();

View File

@ -37,7 +37,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
col_offsets(std::move(col_offsets)),
w_scale(std::move(w_scale)),
w_zp(std::move(w_zp)),
q_scheme(std::move(q_scheme)) {}
q_scheme(q_scheme) {}
std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w;
std::optional<at::Tensor> bias_;
std::vector<int32_t> col_offsets;
@ -316,7 +316,7 @@ Tensor MakeStridedQTensorCPU(
const IntArrayRef& sizes,
const IntArrayRef& strides,
const TensorOptions& options,
QuantizerPtr quantizer);
const QuantizerPtr& quantizer);
Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor(
int64_t N,

View File

@ -7,7 +7,7 @@ QTensorImpl::QTensorImpl(
DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
QuantizerPtr quantizer)
: TensorImpl(std::move(storage), std::move(key_set), data_type),
: TensorImpl(std::move(storage), key_set, data_type),
quantizer_(std::move(quantizer)) {}
QTensorImpl::QTensorImpl(
@ -16,7 +16,7 @@ QTensorImpl::QTensorImpl(
DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
QuantizerPtr quantizer)
: TensorImpl(type, std::move(storage), std::move(key_set), data_type),
: TensorImpl(type, std::move(storage), key_set, data_type),
quantizer_(std::move(quantizer)) {}
const char* QTensorImpl::tensorimpl_type_name() const {
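
Not part of the diff: the constructor changes above keep std::move for members with a non-trivial move (Storage, QuantizerPtr) and drop it for trivially copyable values such as DispatchKeySet, where a "move" is just a copy. A hypothetical sketch:

    #include <cstdint>
    #include <memory>
    #include <utility>

    enum class Scheme : uint8_t { PerTensor, PerChannel };

    struct Impl {
      Impl(std::shared_ptr<int> payload, uint64_t key_bits, Scheme scheme)
          : payload_(std::move(payload)),  // worth moving: shared_ptr has a real move
            key_bits_(key_bits),           // trivially copyable: std::move would just copy
            scheme_(scheme) {}             // same for enums

      std::shared_ptr<int> payload_;
      uint64_t key_bits_;
      Scheme scheme_;
    };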

View File

@ -4,6 +4,8 @@
#include <c10/core/TensorImpl.h>
#include <c10/util/Exception.h>
#include <utility>
namespace at {
/**
@ -36,7 +38,7 @@ struct TORCH_API QTensorImpl : public c10::TensorImpl {
}
void set_quantizer_(QuantizerPtr quantizer) {
quantizer_ = quantizer;
quantizer_ = std::move(quantizer);
}
/**

View File

@ -107,7 +107,7 @@ static int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize
inline Tensor new_qtensor(
IntArrayRef sizes,
const TensorOptions& options,
QuantizerPtr quantizer) {
const QuantizerPtr& quantizer) {
auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
auto device = options.device();
at::Allocator* allocator = nullptr;
@ -338,7 +338,7 @@ Tensor from_blob_quantized_per_tensor_affine(
const std::size_t datasize = size * itemsize;
DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
data, deleter, options.device());
data, std::move(deleter), options.device());
Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};
@ -411,7 +411,7 @@ Tensor from_blob_quantized_per_channel_affine(
const std::size_t datasize = size * itemsize;
DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
data, deleter, options.device());
data, std::move(deleter), options.device());
Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

View File

@ -196,8 +196,8 @@ struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffine
Tensor zero_points,
int64_t axis)
: PerChannelAffineQuantizer(scalar_type,
scales,
zero_points,
std::move(scales),
std::move(zero_points),
axis) {}
QScheme qscheme() const override {
@ -246,7 +246,7 @@ TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);
TORCH_API Tensor new_qtensor(
IntArrayRef sizes,
const TensorOptions& options,
QuantizerPtr quantizer);
const QuantizerPtr& quantizer);
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

View File

@ -396,7 +396,8 @@ size_t PyTorchStreamReader::getRecordMultiReaders(
size_t perThreadSize = (n + nthread - 1) / nthread;
std::vector<size_t> readSizes(nthread, 0);
std::lock_guard<std::mutex> guard(reader_lock_);
for (size_t i = 0; i < nthread; i++) {
loaderThreads.reserve(nthread);
for (size_t i = 0; i < nthread; i++) {
loaderThreads.emplace_back([this,
name,
i,
@ -415,7 +416,7 @@ size_t PyTorchStreamReader::getRecordMultiReaders(
size =
read(recordOff + startPos, (char*)dst + startPos, threadReadSize);
} else {
auto reader = additionalReaders[i - 1];
const auto& reader = additionalReaders[i - 1];
size = reader->read(
recordOff + startPos, (char*)dst + startPos, threadReadSize);
}
@ -641,7 +642,7 @@ size_t PyTorchStreamReader::getRecordSize(const std::string& name) {
size_t PyTorchStreamReader::getRecordOffsetNoRead(
size_t cursor,
std::string filename,
const std::string& filename,
size_t size,
uint64_t alignment) {
std::string full_name = archive_name_plus_slash_ + filename;
@ -697,7 +698,7 @@ PyTorchStreamWriter::PyTorchStreamWriter(
}
PyTorchStreamWriter::PyTorchStreamWriter(
const std::function<size_t(const void*, size_t)> writer_func,
const std::function<size_t(const void*, size_t)>& writer_func,
bool compute_crc32,
uint64_t alignment)
: archive_name_("archive"),
@ -712,7 +713,7 @@ void PyTorchStreamWriter::setup(const string& file_name) {
memset(ar_.get(), 0, sizeof(mz_zip_archive));
archive_name_plus_slash_ = archive_name_ + "/"; // for writeRecord().
if (archive_name_.size() == 0) {
if (archive_name_.empty()) {
CAFFE_THROW("invalid file name: ", file_name);
}
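
Not part of the diff: calling reserve() with the known thread count before the emplace_back loop above (clang-tidy's performance-inefficient-vector-operation) avoids repeated reallocation while the vector grows. Illustrative sketch:

    #include <cstddef>
    #include <thread>
    #include <vector>

    void spawn_workers(std::vector<std::thread>& workers, size_t nthread) {
      workers.reserve(nthread);  // one allocation up front instead of several regrowths
      for (size_t i = 0; i < nthread; ++i) {
        workers.emplace_back([i] {
          (void)i;  // per-thread work for shard i would go here
        });
      }
    }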

View File

@ -180,7 +180,7 @@ class TORCH_API PyTorchStreamReader final {
size_t getRecordOffset(const std::string& name);
size_t getRecordOffsetNoRead(
size_t cursor,
std::string filename,
const std::string& filename,
size_t size,
uint64_t alignment);
bool hasRecord(const std::string& name);
@ -232,7 +232,7 @@ class TORCH_API PyTorchStreamWriter final {
bool compute_crc32 = true,
uint64_t alignment = 64);
explicit PyTorchStreamWriter(
const std::function<size_t(const void*, size_t)> writer_func,
const std::function<size_t(const void*, size_t)>& writer_func,
bool compute_crc32 = true,
uint64_t alignment = 64);

View File

@ -114,8 +114,8 @@ inline Tensor mse_loss(
}
std::vector<torch::Tensor> broadcast_tensors =
torch::broadcast_tensors({input, target});
auto expanded_input = broadcast_tensors[0];
auto expanded_target = broadcast_tensors[1];
const auto& expanded_input = broadcast_tensors[0];
const auto& expanded_target = broadcast_tensors[1];
return torch::mse_loss(
expanded_input, expanded_target, enumtype::reduction_get_enum(reduction));
}

View File

@ -7,6 +7,7 @@
#include <torch/csrc/utils/pybind.h>
#include <string>
#include <utility>
namespace py = pybind11;
@ -30,9 +31,7 @@ namespace pybind11::detail {
} \
}
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ITEM_TYPE_CASTER(torch::Tensor, Tensor);
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ITEM_TYPE_CASTER(std::shared_ptr<torch::nn::Module>, Module);
} // namespace pybind11::detail
@ -42,7 +41,7 @@ template <typename T>
void bind_ordered_dict(py::module module, const char* dict_name) {
using ODict = OrderedDict<std::string, T>;
// clang-format off
py::class_<ODict>(module, dict_name)
py::class_<ODict>(std::move(module), dict_name)
.def("items", &ODict::items)
.def("keys", &ODict::keys)
.def("values", &ODict::values)

View File

@ -594,17 +594,20 @@ Tensor masked_fill_backward(const Tensor& grad, const Tensor& mask) {
}
template <typename T>
Tensor mul_tensor_backward(const Tensor& grad, T other, ScalarType self_st) {
Tensor mul_tensor_backward(
const Tensor& grad,
const T& other,
ScalarType self_st) {
auto out = grad * other.conj();
return handle_r_to_c(self_st, std::move(out));
}
template Tensor mul_tensor_backward(const Tensor&, Tensor, ScalarType);
template Tensor mul_tensor_backward(const Tensor&, Scalar, ScalarType);
template Tensor mul_tensor_backward(const Tensor&, const Tensor&, ScalarType);
template Tensor mul_tensor_backward(const Tensor&, const Scalar&, ScalarType);
template <typename T>
Tensor div_tensor_self_backward(
const Tensor& grad,
T other,
const T& other,
ScalarType self_st,
const std::optional<std::string_view>& rounding_mode) {
if (rounding_mode.has_value()) {
@ -616,12 +619,12 @@ Tensor div_tensor_self_backward(
}
template Tensor div_tensor_self_backward(
const Tensor&,
Tensor,
const Tensor&,
ScalarType,
const std::optional<std::string_view>&);
template Tensor div_tensor_self_backward(
const Tensor&,
Scalar,
const Scalar&,
ScalarType,
const std::optional<std::string_view>&);
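
Not part of the diff: when a function template's parameter changes from T to const T&, its explicit instantiations (as in the FunctionsManual.cpp hunk above) have to be updated in step, since each instantiation must name a signature the template can actually produce. A small hypothetical example:

    #include <string>

    template <typename T>
    std::string describe(const T& value) {
      return "object of " + std::to_string(sizeof(value)) + " bytes";
    }

    // Explicit instantiation definitions spell out the new parameter type;
    // the old by-value forms (e.g. `template std::string describe(int);`)
    // would no longer match any specialization and fail to compile.
    template std::string describe<int>(const int&);
    template std::string describe<double>(const double&);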

View File

@ -137,11 +137,14 @@ at::Tensor pow_backward_exponent(
const at::Tensor& result);
at::Tensor angle_backward(const at::Tensor& grad, const at::Tensor& self);
template <typename T>
at::Tensor mul_tensor_backward(const Tensor& grad, T other, ScalarType self_st);
at::Tensor mul_tensor_backward(
const Tensor& grad,
const T& other,
ScalarType self_st);
template <typename T>
at::Tensor div_tensor_self_backward(
const Tensor& grad,
T other,
const T& other,
ScalarType self_st,
const std::optional<std::string_view>& rounding_mode = std::nullopt);
at::Tensor div_tensor_other_backward(

View File

@ -1366,7 +1366,7 @@ static PyObject* pop_torch_dispatch_stack(
"Attempted to unset ",
c10::impl::to_string(mode_key.value()),
", but there wasn't one active.");
auto mode = maybe_mode.value();
const auto& mode = maybe_mode.value();
r = mode->ptr(getPyInterpreter());
} else {
auto mode = c10::impl::TorchDispatchModeTLS::pop_stack();

View File

@ -1205,7 +1205,7 @@ class AsyncReduceWork : public ProcessGroupGloo::AsyncWork {
protected:
template <typename T>
void getFunction(gloo::ReduceOptions::Func& fn, const ReduceOp op) {
void getFunction(gloo::ReduceOptions::Func& fn, const ReduceOp& op) {
fn = toFunction<T>(op);
}

View File

@ -163,8 +163,8 @@ struct CollectiveFingerPrint {
backend->allgather(output_tensors, tensors_to_verify)->wait();
// Verify equivalence
for (const auto i : c10::irange(output_tensors.size())) {
const std::vector<at::Tensor> gathered_tensors = output_tensors[i];
const at::Tensor reference_tensor = tensors_to_verify[i];
const std::vector<at::Tensor>& gathered_tensors = output_tensors[i];
const at::Tensor& reference_tensor = tensors_to_verify[i];
for (const auto rank : c10::irange(gathered_tensors.size())) {
const auto& rank_tensor = gathered_tensors[rank];
if (!rank_tensor.equal(reference_tensor)) {

View File

@ -1280,11 +1280,11 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) {
// Print dtypes and for float types, print exact precision
auto scalarType = t->scalar_type();
if (scalarType == at::ScalarType::Float) {
std::cout << "Dtype: float32" << std::endl;
std::cout << "Dtype: float32" << '\n';
} else if (scalarType == at::ScalarType::Half) {
std::cout << "Dtype: float16" << std::endl;
std::cout << "Dtype: float16" << '\n';
} else if (scalarType == at::ScalarType::BFloat16) {
std::cout << "Dtype: bfloat16" << std::endl;
std::cout << "Dtype: bfloat16" << '\n';
} else {
std::cout << "Dtype: " << t->dtype() << '\n';
}

View File

@ -838,7 +838,7 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) {
if (!checkFunctionOutputsForLogging(fn)) {
return;
}
auto outputs = fn.outputs();
const auto& outputs = fn.outputs();
auto num_outputs = fn.num_outputs();
// need to account for Stack mode where the outputs are at the end.
size_t output_start = outputs.size() - num_outputs;

View File

@ -57,7 +57,7 @@ static void start_manager() {
handle.append(buffer.data(), bytes_read);
}
SYSCHECK_ERR_RETURN_NEG1(close(pipe_ends[0]));
if (handle.length() == 0) {
if (handle.empty()) {
std::string msg("no response from torch_shm_manager at \"");
msg += manager_executable_path;
msg += "\"";