Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[9/N] Fix extra warnings brought by clang-tidy-17 (#139286)
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139286
Approved by: https://github.com/ezyang
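
As context for reading the diff, here is a minimal, hypothetical C++ sketch (not code from this PR) of the two fix patterns that dominate the changes below: adding an explicit static_cast where an unsigned size value would otherwise narrow to a signed integer, and adding a NOLINTNEXTLINE suppression where a C-style array is intentional.

// Hypothetical illustration only; names and sizes are made up.
#include <cstdint>
#include <vector>

struct Example {
  // Fixed-size C array kept on purpose; suppress the clang-tidy array check.
  // NOLINTNEXTLINE(*array*)
  int64_t sizes_[4] = {0};
};

int64_t count_elements(const std::vector<int64_t>& v) {
  // v.size() returns size_t; the explicit cast silences *-narrowing-conversions.
  return static_cast<int64_t>(v.size());
}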
@@ -59,8 +59,11 @@ struct strided_tensor_iter_fixed {
   T* data_ = NULL;
   int64_t dim_ = 0;

+  // NOLINTNEXTLINE(*array*)
   int64_t counter_[N] = {0};
+  // NOLINTNEXTLINE(*array*)
   int64_t sizes_[N] = {0};
+  // NOLINTNEXTLINE(*array*)
   int64_t strides_[N] = {0};

   strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete;

@@ -11,15 +11,15 @@

 namespace at {

-static cpu_fixed_malloc(void*, ptrdiff_t) {
+static void* cpu_fixed_malloc(void*, ptrdiff_t) {
   TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
 }

-static cpu_fixed_realloc(void*, void*, ptrdiff_t) {
+static void* cpu_fixed_realloc(void*, void*, ptrdiff_t) {
   TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
 }

-static cpu_fixed_free(void* state, void* allocation) {
+static void cpu_fixed_free(void* state, void* allocation) {
   auto on_release = static_cast<std::function<void(void*)>*>(state);
   (*on_release)(allocation);
   delete on_release;

@@ -256,7 +256,7 @@ Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Te
   dim = at::maybe_wrap_dim(dim, base.dim());
   auto dim_size = base.sym_size(dim);
   c10::SymInt start = 0;
-  for (auto i = 0; i < mutated_view_idx; ++i) {
+  for (int64_t i = 0; i < mutated_view_idx; ++i) {
     start += split_sizes[i];
   }
   auto end = start + split_sizes[mutated_view_idx];

@@ -83,10 +83,10 @@ static c10::SymInt get_nbytes(const Tensor& value) {
     if (value.key_set().has(c10::DispatchKey::Python)) {
       return value.storage().sym_nbytes();
     }
-    return at::detail::computeStorageNbytes(value.sym_sizes(), value.sym_strides(), value.dtype().itemsize(), value.sym_storage_offset());
+    return at::detail::computeStorageNbytes(value.sym_sizes(), value.sym_strides(), static_cast<int64_t>(value.dtype().itemsize()), value.sym_storage_offset());
   }
   // XLA storage objects also do not properly track nbytes.
-  return at::detail::computeStorageNbytes(value.sizes(), value.strides(), value.dtype().itemsize(), value.storage_offset());
+  return static_cast<int64_t>(at::detail::computeStorageNbytes(value.sizes(), value.strides(), value.dtype().itemsize(), value.storage_offset()));
 }

 FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)

@@ -154,7 +154,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
           "please file a bug report instead.");
     }
     batched_tensor_inputs.push_back(tensor);
-    batched_tensor_inputs_position.push_back(idx);
+    batched_tensor_inputs_position.push_back(static_cast<int64_t>(idx));
   }
   TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());

@@ -288,7 +288,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
       continue;
     }
     batched_tensor_inputs.push_back(tensor);
-    batched_tensor_inputs_position.push_back(idx);
+    batched_tensor_inputs_position.push_back(static_cast<int64_t>(idx));
   }
   TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());

@@ -25,7 +25,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims)
   const auto value_strides = value_.strides();
   sizes_and_strides_.resize(public_dims);
   for (const auto dim : c10::irange(public_dims)) {
-    auto actual_dim = actualDim(dim, /*wrap_dim=*/false);
+    auto actual_dim = actualDim(static_cast<int64_t>(dim), /*wrap_dim=*/false);
     sizes_and_strides_.size_at_unchecked(dim) = value_sizes.at(actual_dim);
     sizes_and_strides_.stride_at_unchecked(dim) = value_strides.at(actual_dim);
   }

@@ -37,7 +37,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims)
 int64_t BatchedTensorImpl::actualDim(int64_t dim, bool wrap_dim) const {
   if (wrap_dim) {
     const auto ndim = sizes_and_strides_.size();
-    dim = maybe_wrap_dim(dim, ndim);
+    dim = maybe_wrap_dim(dim, static_cast<int64_t>(ndim));
   }
   auto is_bdim = createBatchDimBitset(bdims_);

@@ -366,7 +366,7 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) {
 }

 static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) {
-  return maybe_wrap_dim(dim, input_sizes.size()) + num_batch_dims;
+  return maybe_wrap_dim(dim, static_cast<int64_t>(input_sizes.size())) + num_batch_dims;
 }

 Tensor select_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t index) {

@@ -35,7 +35,7 @@ static Tensor permuteBatchDimsToFront(BatchedTensorImpl* batched) {
     if (is_bdim[ptr]) {
       continue;
     }
-    permutation[idx++] = ptr;
+    permutation[idx++] = static_cast<int64_t>(ptr);
   }
   return physical_tensor.permute(permutation);
 }

@@ -49,7 +49,7 @@ VmapPhysicalView MultiBatchVmapTransform::logicalToPhysical(const Tensor& logica
 }

 int64_t VmapPhysicalView::numBatchDims() const {
-  return levels_.count();
+  return static_cast<int64_t>(levels_.count());
 }

 int64_t VmapPhysicalView::numLogicalDims() const {

@@ -202,7 +202,7 @@ MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) {
   // batch dims have been moved to the front of the tensor. Any previously
   // non-existing batch dims get added to the tensors as new dimensions of size 1.
   std::vector<Tensor> physical_tensors;
-  int64_t num_batch_dims = collective_levels.count();
+  auto num_batch_dims = collective_levels.count();
   for (const auto& logical_tensor : logical_tensors) {
     auto requested_example_dim = /*logical_dim*/logical_tensor.dim();
     auto physical_tensor = alignBatchDimsAtFront(

@@ -21,7 +21,7 @@ ThreadLocalState::ThreadLocalState()
       saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()),
       saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) {
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER)
-  for(uint8_t i=0; i<autocast_dtypes_.size(); i++) {
+  for(size_t i=0; i<autocast_dtypes_.size(); i++) {
     autocast_dtypes_[i] = at::autocast::get_autocast_dtype(static_cast<at::DeviceType>(i));
   }
 #endif

@@ -62,7 +62,7 @@ void ThreadLocalState::setThreadLocalState(

   at::impl::ThreadLocalPythonObjects::set_state(state.saved_objects_);
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER)
-  for(uint8_t i=0; i<state.autocast_dtypes_.size(); i++) {
+  for(size_t i=0; i<state.autocast_dtypes_.size(); i++) {
     at::autocast::set_autocast_dtype(static_cast<at::DeviceType>(i), state.autocast_dtypes_[i]);
   }
 #endif

@@ -67,14 +67,14 @@ class Operation {
 // treat the last N elements of the stack as a list, looking up
 // element i
 inline IValue& peek(Stack& stack, size_t i, size_t N) {
-  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  // NOLINTNEXTLINE(*-narrowing-conversions)
   return *(stack.end() - N + i);
 }
 inline IValue& peek(Stack* stack, size_t i, size_t N) {
   return peek(*stack, i, N);
 }
 inline const IValue& peek(const Stack& stack, size_t i, size_t N) {
-  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  // NOLINTNEXTLINE(*-narrowing-conversions)
   return *(stack.end() - N + i);
 }
 inline const IValue& peek(const Stack* stack, size_t i, size_t N) {

@@ -96,7 +96,7 @@ inline at::ArrayRef<IValue> last(const Stack* stack, size_t N) {
   return last(*stack, N);
 }
 inline void drop(Stack& stack, size_t n) {
-  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  // NOLINTNEXTLINE(*-narrowing-conversions)
   stack.erase(stack.end() - n, stack.end());
 }
 inline void drop(Stack* stack, size_t n) {

@@ -282,6 +282,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
   }
   template <typename T>
   inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
+    // NOLINTNEXTLINE(bugprone-sizeof-expression)
     TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
   }
 };

@@ -1750,6 +1751,7 @@ void trsm<c10::complex<double>>(CUDABLAS_TRSM_ARGTYPES(c10::complex<double>)) {
 }

 template <>
+// NOLINTNEXTLINE(*array*)
 void trsmBatched<float>(CUDABLAS_TRSM_BATCHED_ARGTYPES(float)) {
   TORCH_CUDABLAS_CHECK(cublasStrsmBatched(
       handle,

@@ -1768,6 +1770,7 @@ void trsmBatched<float>(CUDABLAS_TRSM_BATCHED_ARGTYPES(float)) {
 }

 template <>
+// NOLINTNEXTLINE(*array*)
 void trsmBatched<double>(CUDABLAS_TRSM_BATCHED_ARGTYPES(double)) {
   TORCH_CUDABLAS_CHECK(cublasDtrsmBatched(
       handle,

@@ -1787,6 +1790,7 @@ void trsmBatched<double>(CUDABLAS_TRSM_BATCHED_ARGTYPES(double)) {

 template <>
 void trsmBatched<c10::complex<float>>(
+    // NOLINTNEXTLINE(*array*)
     CUDABLAS_TRSM_BATCHED_ARGTYPES(c10::complex<float>)) {
   TORCH_CUDABLAS_CHECK(cublasCtrsmBatched(
       handle,

@@ -1806,6 +1810,7 @@ void trsmBatched<c10::complex<float>>(

 template <>
 void trsmBatched<c10::complex<double>>(
+    // NOLINTNEXTLINE(*array*)
     CUDABLAS_TRSM_BATCHED_ARGTYPES(c10::complex<double>)) {
   TORCH_CUDABLAS_CHECK(cublasZtrsmBatched(
       handle,

@@ -33,7 +33,7 @@ void init_p2p_access_cache(int64_t num_devices) {

 } // namespace detail

-bool get_p2p_access(int dev, int dev_to_access) {
+bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
   at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);

   TORCH_CHECK(dev >= 0 || dev < num_devices_,

@@ -1,4 +1,5 @@
 #include <c10/macros/Macros.h>
+#include <c10/core/Device.h>
 #include <cstdint>

 namespace at::cuda {

@@ -6,6 +7,6 @@ namespace detail {
 void init_p2p_access_cache(int64_t num_devices);
 }

-TORCH_CUDA_CPP_API bool get_p2p_access(int source_dev, int dest_dev);
+TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev);

 } // namespace at::cuda

@@ -59,8 +59,8 @@ void remove_padding_kernelLauncher(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template <typename T>
 void remove_padding_transform0213_kernelLauncher(

@@ -69,8 +69,8 @@ void remove_padding_transform0213_kernelLauncher(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template <typename T>
 void add_padding_kernelLauncher(

@@ -1,5 +1,3 @@
-#include <numeric>
-#include <algorithm>
 #include <c10/util/Exception.h>

 #include <ATen/ATen.h>

@@ -118,7 +116,7 @@ Tensor nested_from_padded_cuda(
   }
 }

-Tensor batch_offsets_from_efficient_size(const Tensor& ef_sizes) {
+static Tensor batch_offsets_from_efficient_size(const Tensor& ef_sizes) {
   int64_t* nt_sizes_ptr = ef_sizes.data_ptr<int64_t>();
   int64_t ef_sizes_size_0 = ef_sizes.sizes()[0];
   Tensor offsets = at::empty({1 + ef_sizes_size_0}, at::kLong);

@@ -154,8 +154,8 @@ void remove_padding_kernelLauncher(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size) {
+    int64_t output_dim,
+    const int64_t batch_size) {
   dim3 grid;
   grid.x = batch_size;
   grid.y = GRID_DIM_Y;

@@ -188,8 +188,8 @@ void remove_padding_transform0213_kernelLauncher(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size) {
+    int64_t output_dim,
+    const int64_t batch_size) {
   dim3 grid;
   grid.x = batch_size;
   grid.y = GRID_DIM_Y;

@@ -214,8 +214,8 @@ template void remove_padding_kernelLauncher<float>(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template void remove_padding_kernelLauncher<c10::Half>(
     const c10::Half* input,

@@ -223,8 +223,8 @@ template void remove_padding_kernelLauncher<c10::Half>(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template void remove_padding_transform0213_kernelLauncher<float>(
     const float* input,

@@ -232,8 +232,8 @@ template void remove_padding_transform0213_kernelLauncher<float>(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template void remove_padding_transform0213_kernelLauncher<c10::Half>(
     const c10::Half* input,

@@ -241,8 +241,8 @@ template void remove_padding_transform0213_kernelLauncher<c10::Half>(
     const int* offsets,
     const int* input_sizes,
     const int* output_sizes,
-    int output_dim,
-    const int batch_size);
+    int64_t output_dim,
+    const int64_t batch_size);

 template <typename T>
 __global__ void add_padding_1(

@@ -89,7 +89,7 @@ int64_t get_nnz(const Tensor& nestedtensor) {
   const Tensor& tensor_strides = tensor->get_nested_strides();

   const int64_t n_tensors = tensor_strides.size(0);
-  constexpr int64_t n_dims = 3;
+  constexpr int n_dims = 3;
   // This is safe since head_dim is assured to be consistent
   const int64_t num_heads = tensor -> opt_size(2).value();
   const int64_t tensor_stride_0 = tensor_strides.stride(0);

@@ -114,7 +114,7 @@ static PyObject* THPEvent_record(
     auto stream = (THPStream*)_stream;
     self->event.record(c10::Stream::unpack3(
         stream->stream_id,
-        stream->device_index,
+        static_cast<c10::DeviceIndex>(stream->device_index),
         static_cast<c10::DeviceType>(stream->device_type)));
   } else {
     c10::impl::VirtualGuardImpl impl{

@@ -192,7 +192,7 @@ static PyObject* THPEvent_wait(
     auto stream = (THPStream*)_stream;
     self->event.block(c10::Stream::unpack3(
         stream->stream_id,
-        stream->device_index,
+        static_cast<c10::DeviceIndex>(stream->device_index),
         static_cast<c10::DeviceType>(stream->device_type)));
   } else {
     c10::impl::VirtualGuardImpl impl{

@@ -326,7 +326,7 @@ static PyObject* THPModule_setNumThreads(PyObject* module, PyObject* arg) {
 static PyObject* THPModule_getNumInteropThreads(
     PyObject* module,
     PyObject* noargs) {
-  return THPUtils_packInt32(at::get_num_interop_threads());
+  return THPUtils_packUInt64(at::get_num_interop_threads());
 }

 static PyObject* THPModule_setNumInteropThreads(

@@ -46,7 +46,7 @@ using PyModuleClass =
 /// to which it delegates all calls.
 template <typename ModuleType>
 void bind_cpp_module_wrapper(
-    py::module module,
+    const py::module& module,
     PyModuleClass<ModuleType> cpp_class,
     const char* name) {
   // Grab the `torch.nn.cpp.ModuleWrapper` class, which we'll subclass

@@ -1280,6 +1280,7 @@ PyObject* THPModule_increment_version(
 }

 // autograd methods on torch._C
+// NOLINTNEXTLINE(*array*)
 static PyMethodDef methods[] = {
     {"_set_grad_enabled",
      castPyCFunctionWithKeywords(set_grad_enabled),

@@ -81,7 +81,7 @@ inline PyObject* wrap(at::QScheme qscheme) {
 }

 inline PyObject* wrap(at::TensorList tl) {
-  auto r = THPObjectPtr{PyTuple_New(tl.size())};
+  auto r = THPObjectPtr{PyTuple_New(static_cast<Py_ssize_t>(tl.size()))};
   if (!r)
     throw python_error();
   for (const auto i : c10::irange(tl.size())) {

@@ -91,7 +91,7 @@ inline PyObject* wrap(at::TensorList tl) {
 }

 inline PyObject* wrap(at::IntArrayRef list) {
-  auto r = THPObjectPtr{PyTuple_New(list.size())};
+  auto r = THPObjectPtr{PyTuple_New(static_cast<Py_ssize_t>(list.size()))};
   if (!r)
     throw python_error();
   for (const auto i : c10::irange(list.size())) {

@@ -358,10 +358,12 @@ struct TORCH_API ViewFunc {
   /// Sets the values of any SymInts in the saved state. The input vector size
   /// must match the number of SymInts in the saved state (i.e. the size of the
   /// list returned by get_symints()).
+  /// NOLINTNEXTLINE(performance-unnecessary-value-param)
   virtual void set_symints(std::vector<c10::SymInt>) {}
   /// Sets the values of any Tensors in the saved state. The input vector size
   /// must match the number of Tensors in the saved state (i.e. the size of the
   /// list returned by get_tensors()).
+  /// NOLINTNEXTLINE(performance-unnecessary-value-param)
   virtual void set_tensors(std::vector<at::Tensor>) {}
 };

@@ -1253,6 +1253,7 @@ static void registerCudaPluggableAllocator(PyObject* module) {
   m.def(
       "_set_storage_data_ptr_access_error_msg",
       [](size_t storage_impl_ptr, std::string s) {
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
        c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
        storage_impl->release_data_and_set_meta_custom_data_ptr_error_msg_(s);
      });

@@ -282,8 +282,7 @@ std::vector<at::Tensor>& scatter_out(
   at::cuda::OptionalCUDAStreamGuard cuda_guard;
   for (const auto i : c10::irange(chunks.size())) {
     if (i < (streams ? streams->size() : 0U) && (*streams)[i]) {
-      const auto device_index =
-          static_cast<int16_t>(out_tensors[i].get_device());
+      const auto device_index = out_tensors[i].get_device();
       TORCH_CHECK(
           (*streams)[i]->device_index() == device_index,
           "Expected the device associated with the stream at index ",

@@ -293,7 +292,7 @@ std::vector<at::Tensor>& scatter_out(
           ") ",
           "to match the device supplied at that index ",
           "(expected ",
-          device_index,
+          static_cast<int16_t>(device_index),
           ")");
       cuda_guard.reset_stream(*(*streams)[i]);
     }

@@ -109,6 +109,7 @@ ncclDataType_t to_nccl_data_type(c10::ScalarType type) {
       return ncclDataType_t::ncclInt;
     case at::kChar:
       return ncclDataType_t::ncclChar;
+    // NOLINTNEXTLINE(*-narrowing-conversions)
     case at::kByte:
       return ncclDataType_t::ncclUint8;
     case at::kBool:

@@ -260,8 +261,9 @@ void throw_nccl_error(torch::cuda::nccl::ncclResult status) {
 }

 struct NcclCommList {
+  // NOLINTNEXTLINE(*array*)
   std::unique_ptr<ncclComm_t[]> comms;
-  int ndevices;
+  size_t ndevices;
   NcclCommList(const std::vector<int>& devices)
       : comms(new ncclComm_t[devices.size()]), ndevices(devices.size()) {
     NCCL_CHECK(ncclCommInitAll(

@@ -309,8 +311,8 @@ ArrayRef<ncclComm_t> get_communicators(TensorList inputs) {
 static inline void check_tensor(
     const at::Tensor& input,
     const std::optional<at::Tensor>& output,
-    int input_multiplier,
-    int output_multiplier,
+    size_t input_multiplier,
+    size_t output_multiplier,
     int64_t ref_numel,
     ScalarType ref_dtype) {
   auto check_one = [&](const at::Tensor& tensor) {

@@ -355,12 +357,12 @@ static inline void check_tensor(
 void check_inputs(
     TensorList inputs,
     TensorList outputs,
-    int input_multiplier,
-    int output_multiplier) {
+    size_t input_multiplier,
+    size_t output_multiplier) {
   // len(inputs) == len(outputs)
   size_t len = inputs.size();

-  if (len <= 0) {
+  if (len == 0) {
     throw std::runtime_error("input sequence can't be empty");
   }

@@ -967,7 +969,7 @@ void all2all(
   uintptr_t recvBase = reinterpret_cast<uintptr_t>(outputTensors[0].data_ptr());
   size_t dtypeSize = inputTensors.front().element_size();

-  for (const auto r : c10::irange(outputTensors.size())) {
+  for (const int r : c10::irange(outputTensors.size())) {
     sendCounts[r] = inputTensors[r].numel();
     auto sendOffset =
         reinterpret_cast<uintptr_t>(inputTensors[r].data_ptr()) - sendBase;

@@ -995,7 +997,7 @@ void all2all(
       stream.stream()));
 #else
   NCCL_CHECK(ncclGroupStart());
-  for (const auto r : c10::irange(outputTensors.size())) {
+  for (const int r : c10::irange(static_cast<int>(outputTensors.size()))) {
     at::Tensor& input = inputTensors[r];
     at::Tensor& output = outputTensors[r];

@@ -32,7 +32,7 @@ typedef void* ncclComm_t;
 * nccl impp. */
 #define NCCL_UNIQUE_ID_BYTES 128
 typedef struct {
-  // NOLINTNEXTLINE(*array)
+  // NOLINTNEXTLINE(*array*)
   char internal[NCCL_UNIQUE_ID_BYTES];
 } ncclUniqueId;

@@ -100,14 +100,14 @@ TORCH_CUDA_CPP_API at::ArrayRef<ncclComm_t> get_communicators(
 TORCH_CUDA_CPP_API void check_inputs(
     at::TensorList inputs,
     at::TensorList outputs,
-    int input_multiplier,
-    int output_multiplier);
+    size_t input_multiplier,
+    size_t output_multiplier);
 TORCH_CUDA_CPP_API void check_inputs(
     at::TensorList inputs,
     const at::Tensor& output,
     int root,
-    int input_multiplier,
-    int output_multiplier);
+    size_t input_multiplier,
+    size_t output_multiplier);

 } // namespace detail

@@ -73,6 +73,7 @@ void initCudartBindings(PyObject* module) {
       [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t {
         py::gil_scoped_release no_gil;
         return C10_CUDA_ERROR_HANDLED(
+            // NOLINTNEXTLINE(performance-no-int-to-ptr)
             cudaHostRegister((void*)ptr, size, flags));
       });
   cudart.def(

@@ -80,6 +81,7 @@ void initCudartBindings(PyObject* module) {
       "HostUnregister",
       [](uintptr_t ptr) -> cudaError_t {
         py::gil_scoped_release no_gil;
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         return C10_CUDA_ERROR_HANDLED(cudaHostUnregister((void*)ptr));
       });
   cudart.def(

@@ -87,6 +89,7 @@ void initCudartBindings(PyObject* module) {
       "StreamCreate",
       [](uintptr_t ptr) -> cudaError_t {
         py::gil_scoped_release no_gil;
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr));
       });
   cudart.def(

@@ -94,6 +97,7 @@ void initCudartBindings(PyObject* module) {
       "StreamDestroy",
       [](uintptr_t ptr) -> cudaError_t {
         py::gil_scoped_release no_gil;
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr));
       });
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12000
Block a user