use irange for loops (#66234)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66234

Modified loops in files under fbsource/fbcode/caffe2/ from the format `for (TYPE var = x0; var < x_max; var++)` to the format `for (const auto var : irange(xmax))`.

This was achieved by running r-barnes's loop upgrader script (D28874212), with some modifications to exclude all files under /torch/jit, plus a number of reversions and unused-variable warning suppressions added by hand.

bypass_size_limit
allow-large-files

Test Plan: Sandcastle

Reviewed By: ngimel

Differential Revision: D30652629

fbshipit-source-id: 0ae6c4bbbb554bad42e372792a6430e1acf15e3e
Committed by: Facebook GitHub Bot
Parent: b5b7d6a3a6
Commit: 687c2267d4
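The change itself is mechanical: an index-based for loop over [0, n) becomes a range-based loop over c10::irange(n), and loops with a non-zero lower bound use the two-argument form c10::irange(start, end); both forms appear throughout the diff below. A minimal sketch of the pattern, assuming only <c10/util/irange.h>; the function and variable names here are made up for illustration and are not taken from the diff:

    #include <c10/util/irange.h>

    #include <cstdint>
    #include <vector>

    void fill_indices(std::vector<int64_t>& indices, int64_t n) {
      // Before the rewrite this would read:
      //   for (int64_t i = 0; i < n; i++) { indices.push_back(i); }
      // c10::irange(n) yields 0, 1, ..., n-1, and the loop variable is declared
      // const, so it cannot be accidentally mutated inside the body.
      for (const auto i : c10::irange(n)) {
        indices.push_back(i);
      }
      // Loops with a non-zero lower bound use the two-argument form,
      // c10::irange(start, end), which iterates over [start, end).
    }

One practical effect, visible in the hunks below, is that loops previously written with int, int64_t, or size_t counters all end up with the counter type deduced from the loop bound.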
@@ -4,6 +4,7 @@
 #include <string>
 
 #include <c10/core/MemoryFormat.h>
+#include <c10/util/irange.h>
 
 #include <fbjni/ByteBuffer.h>
 #include <fbjni/fbjni.h>
@@ -97,7 +98,7 @@ static at::Tensor newAtTensor(
   std::vector<int64_t> shapeVec{};
   shapeVec.reserve(rank);
   auto numel = 1;
-  for (auto i = 0; i < rank; ++i) {
+  for (const auto i : c10::irange(rank)) {
     shapeVec.push_back(shapeArr[i]);
     numel *= shapeArr[i];
   }
@@ -521,7 +522,7 @@ at::IValue JIValue::JIValueToAtIValue(
 
   std::vector<at::IValue> elements;
   elements.reserve(n);
-  for (auto i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     auto jivalue_element = jarray->getElement(i);
     auto element = JIValue::JIValueToAtIValue(jivalue_element);
     elements.push_back(std::move(element));
@@ -535,7 +536,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<bool> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -547,7 +548,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<int64_t> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -559,7 +560,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<double> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -572,7 +573,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArray->size();
   c10::List<at::Tensor> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(
         TensorHybrid::newAtTensorFromJTensor(jArray->getElement(i)));
   }
@@ -594,7 +595,7 @@ at::IValue JIValue::JIValueToAtIValue(
   c10::impl::GenericList list{c10::unshapedType(first_element.type())};
   list.reserve(n);
   list.push_back(first_element);
-  for (auto i = 1; i < n; ++i) {
+  for (const auto i : c10::irange(1, n)) {
     auto jivalue_element = jarray->getElement(i);
     auto element = JIValue::JIValueToAtIValue(jivalue_element);
     list.push_back(element);
@@ -6,6 +6,7 @@
 #include <fbjni/ByteBuffer.h>
 #include <fbjni/fbjni.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/script.h>
@@ -157,7 +158,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
   std::vector<at::IValue> inputs{};
   size_t n = jinputs->size();
   inputs.reserve(n);
-  for (size_t i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
     if (at::kVulkan == deviceType_) {
       inputs.push_back(
@@ -186,7 +187,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
   std::vector<at::IValue> inputs{};
   size_t n = jinputs->size();
   inputs.reserve(n);
-  for (size_t i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
     if (at::kVulkan == deviceType_) {
       inputs.push_back(
@@ -3,6 +3,7 @@
 #include <ATen/BatchedFallback.h>
 #include <ATen/native/ResizeCommon.h>
 #include <ATen/ATen.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -329,7 +330,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) {
 
   VmapDimVector all_dims_physical;
   all_dims_physical.reserve(self_physical.tensor().dim());
-  for (int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) {
+  for (const auto bdim : c10::irange(self_physical.numBatchDims())) {
     all_dims_physical.push_back(bdim);
   }
   all_dims_physical.insert(
@@ -2,6 +2,7 @@
 
 #include <ATen/Parallel.h>
 #include <ATen/TensorUtils.h>
+#include <c10/util/irange.h>
 #include <limits>
 #include <utility>
 #include <cstring>
@@ -130,7 +131,7 @@ inline Tensor sort_strides(Tensor& tensor_) {
   IntArrayRef strides = tensor_.strides();
   std::vector<int64_t> indices;
   indices.reserve(tensor_.ndimension());
-  for (int64_t i = 0; i < tensor_.ndimension(); i++) {
+  for (const auto i : c10::irange(tensor_.ndimension())) {
     indices.push_back(i);
   }
   std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) {
@@ -196,7 +197,7 @@ inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
   if (tensors.size() == 0)
     return true;
   int64_t all_numel = tensors[0].numel();
-  for (size_t i = 1; i < tensors.size(); i++) {
+  for (const auto i : c10::irange(1, tensors.size())) {
     if (tensors[i].numel() != all_numel)
       return false;
   }
@@ -11,6 +11,7 @@
 #include <c10/util/Exception.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/core/QEngine.h>
+#include <c10/util/irange.h>
 
 #include <memory>
 #include <mutex>
@@ -349,7 +350,7 @@ static inline void manual_seed(uint64_t seed) {
   // available. In that case, we must not seed CUDA; it will fail!
   const auto num_gpus = detail::getCUDAHooks().getNumGPUs();
   if (hasCUDA() && num_gpus > 0) {
-    for (int i = 0; i < num_gpus; i++) {
+    for (const auto i : c10::irange(num_gpus)) {
       auto cuda_gen = globalContext().defaultGenerator(
           Device(at::kCUDA, static_cast<c10::DeviceIndex>(i))
       );
@@ -197,7 +197,7 @@ std::vector<int64_t> infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef t
   // compute output strides which preserves the input tensor's memory layout
   std::vector<int64_t> out_strides(ndim);
   int64_t curr_stride = 1;
-  for (size_t i = 0; i < ndim; ++i) {
+  for (const auto i : c10::irange(ndim)) {
     int64_t idx = perm[i];
     out_strides[idx] = curr_stride;
     // Note: for size 0, we simply treated it as 1, it really doesn't matter here
@@ -4,6 +4,7 @@
 #include <ATen/Tensor.h>
 #include <c10/util/Exception.h>
 #include <c10/util/MaybeOwned.h>
+#include <c10/util/irange.h>
 
 #include <functional>
 #include <sstream>
@@ -266,7 +267,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
   // expands a list of Tensors; ignores undefined (null) tensors
   bool first = true;
   DimVector sizes;
-  for (size_t i = 0; i < to_expand.size(); ++i) {
+  for (const auto i : c10::irange(to_expand.size())) {
     if (!to_expand[i].defined()) {
       continue;
     } else if (first) {
@@ -278,7 +279,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
   }
 
   std::vector<Tensor> result(to_expand.size());
-  for (size_t i = 0; i < to_expand.size(); ++i) {
+  for (const auto i : c10::irange(to_expand.size())) {
     if (!to_expand[i].defined()) {
       continue;
     } else if (to_expand[i].sizes().equals(sizes)) {
@@ -299,7 +300,7 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) {
   c10::SmallVector<int64_t, 8> reduce_dims;
   const at::IntArrayRef sizes = tensor.sizes();
   const int64_t leading_dims = sizes.size() - shape.size();
-  for (int64_t i = 0; i < leading_dims; ++i) {
+  for (const auto i : c10::irange(leading_dims)) {
     reduce_dims.push_back(i);
   }
   for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
@@ -320,7 +321,7 @@ static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
   if (ndim > target_dim) {
     return false;
   }
-  for (size_t i = 0; i < ndim; i++) {
+  for (const auto i : c10::irange(ndim)) {
     int64_t size = shape[ndim - i - 1];
     int64_t target = desired[target_dim - i - 1];
     if (size != target && size != 1) {
@@ -1,6 +1,7 @@
 #include <ATen/MemoryOverlap.h>
 #include <ATen/core/TensorBase.h>
 #include <c10/core/Layout.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -17,7 +18,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) {
 
   auto strides = t->strides();
   auto sizes = t->sizes();
-  for (size_t i = 0; i < strides.size(); ++i) {
+  for (const auto i : c10::irange(strides.size())) {
     if (strides[i] == 0 && sizes[i] > 1) {
       return MemOverlap::YES;
     }
@@ -225,7 +225,7 @@ std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor) {
   }
   std::vector<Dimname> outnames;
   auto tensor_names = tensor.names();
-  for (int64_t d = 0; d < tensor.dim(); d++) {
+  for (const auto d : c10::irange(tensor.dim())) {
     if (tensor.sizes()[d] != 1) {
       outnames.push_back(tensor_names[d]);
     }
@@ -242,7 +242,7 @@ std::vector<Dimname> compute_diagonal_outnames(
   }
   std::vector<Dimname> outnames;
   auto tensor_names = tensor.names();
-  for (int64_t d = 0; d < tensor.dim(); d++) {
+  for (const auto d : c10::irange(tensor.dim())) {
     if (d == dim1 || d == dim2) {
       continue;
     }
@@ -6,6 +6,7 @@
 
 #ifndef C10_MOBILE
 #include <c10/core/thread_pool.h>
+#include <c10/util/irange.h>
 #else
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #endif // C10_MOBILE
@@ -87,7 +88,7 @@ TaskThreadPoolBase& _get_intraop_pool() {
 // `fn` will be called with params: (thread_pool_task_id, task_id).
 void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
 #ifndef C10_MOBILE
-  for (size_t i = 1; i < range; ++i) {
+  for (const auto i : c10::irange(1, range)) {
     _get_intraop_pool().run([fn, i]() { fn((int)i, i); });
   }
   // Run the first task on the current thread directly.
@@ -3,6 +3,7 @@
 #include <ATen/Tensor.h>
 #include <c10/core/TensorImpl.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 namespace at {
 struct TORCH_API SparseTensorImpl : public TensorImpl {
@@ -109,7 +110,7 @@ public:
   bool shrinking_dense_dim = false;
   auto sparse_size_original = sizes().slice(0, sparse_dim);
   auto sparse_size_new = size.slice(0, sparse_dim);
-  for (int64_t i = 0; i < sparse_dim; i++) {
+  for (const auto i : c10::irange(sparse_dim)) {
     if (sparse_size_new[i] < sparse_size_original[i]) {
       shrinking_sparse_dims = true;
       break;
@@ -117,7 +118,7 @@ public:
   }
   auto dense_size_original = sizes().slice(sparse_dim);
   auto dense_size_new = size.slice(sparse_dim);
-  for (int64_t i = 0; i < dense_dim; i++) {
+  for (const auto i : c10::irange(dense_dim)) {
     if (dense_size_new[i] < dense_size_original[i]) {
       shrinking_dense_dim = true;
       break;
@@ -3,6 +3,7 @@
 #include <ATen/ATen.h>
 #include <ATen/SparseTensorImpl.h>
 #include <ATen/Parallel.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace sparse {
 
@@ -98,7 +99,7 @@ Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) {
   at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     int64_t h, hp0, hp1;
-    for (auto i = start; i < end; i++) {
+    for (const auto i : c10::irange(start, end)) {
      hp0 = indices[i];
      hp1 = (i+1 == nnz) ? dim : indices[i+1];
      if (hp0 != hp1) {
@@ -1,6 +1,7 @@
 #include <ATen/TensorIndexing.h>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 namespace at {
 namespace indexing {
@@ -31,7 +32,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 
 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
   stream << "(";
-  for (size_t i = 0; i < tensor_indices.size(); i++) {
+  for (const auto i : c10::irange(tensor_indices.size())) {
     stream << tensor_indices[i];
     if (i < tensor_indices.size() - 1) stream << ", ";
   }
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/util/Optional.h>
+#include <c10/util/irange.h>
 #include <ATen/core/TensorBody.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/Functions.h>
@@ -335,7 +336,7 @@ static inline Tensor scalarToTensor(const Scalar& v, const TensorOptions& option
 // strip away unit dimensions from the left of 'src'
 static inline IntArrayRef slicePrefix1sSize(const IntArrayRef& sizes) {
   size_t first_non1_src = sizes.size();
-  for (size_t i = 0; i < sizes.size(); ++i) {
+  for (const auto i : c10::irange(sizes.size())) {
     if (sizes[i] != 1) {
       first_non1_src = i;
       break;
@@ -439,7 +440,7 @@ static inline Tensor applySlicing(
       "too many indices for tensor of dimension ", (int)self_sizes.size());
 
   Tensor result = self;
-  for (size_t i = 0; i < indices.size(); i++) {
+  for (const auto i : c10::irange(indices.size())) {
     auto& obj = indices[i];
     result = handleDimInMultiDimIndexing(
         /*prev_dim_result=*/result,
@@ -36,8 +36,8 @@ inline void get_base_ptrs(char** ptrs, ArrayRef<OperandInfo> operands) {
 }
 
 inline void get_strides(int64_t* strides, ArrayRef<OperandInfo> operands, int64_t ndim) {
-  for (int64_t dim = 0; dim < ndim; ++dim) {
-    for (size_t arg = 0; arg < operands.size(); ++arg) {
+  for (const auto dim : c10::irange(ndim)) {
+    for (const auto arg : c10::irange(operands.size())) {
       *strides++ = operands[arg].stride_bytes[dim];
     }
   }
@@ -214,7 +214,7 @@ void TensorIteratorBase::reorder_dimensions() {
   // returns 1 if the dim0 should come after dim1, -1 if dim0 should come
   // before dim1, and 0 if the comparison is ambiguous.
   auto should_swap = [&](size_t dim0, size_t dim1) {
-    for (int arg = 0; arg < ntensors(); arg++) {
+    for (const auto arg : c10::irange(ntensors())) {
       // ignore undefined or incorrectly sized tensors
       if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) {
        continue;
@@ -251,7 +251,7 @@ void TensorIteratorBase::reorder_dimensions() {
   };
 
   // insertion sort with support for ambiguous comparisons
-  for (int i = 1; i < ndim(); i++) {
+  for (const auto i : c10::irange(1, ndim())) {
     int dim1 = i;
     for (int dim0 = i - 1; dim0 >= 0; dim0--) {
       int comparison = should_swap(perm_[dim0], perm_[dim1]);
@@ -497,7 +497,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
 StrideVector TensorIteratorBase::compatible_stride(int element_size) const {
   auto stride = StrideVector();
   int64_t next_stride = element_size;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     stride.push_back(next_stride);
     next_stride *= shape_[dim];
   }
@@ -510,14 +510,14 @@ DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const {
   TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_);
   TORCH_INTERNAL_ASSERT(input.size()==perm_.size());
   auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to.
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     res[perm_[dim]] = input[dim];
   }
   return res;
 }
 
 void TensorIteratorBase::allocate_or_resize_outputs() {
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     auto& op = operands_[i];
     if (!op.tensor_base().defined() || op.will_resize) {
       TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -525,7 +525,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
       op.stride_bytes = compatible_stride(element_size);
       // check if permutation is just an inverted order
      bool inverted = true;
-      for (int i = 0; i < ndim(); i++) {
+      for (const auto i : c10::irange(ndim())) {
        if (perm_[i] != ndim() - i - 1) {
          inverted = false;
          break;
@@ -539,7 +539,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
      set_output(i, tensor_shape, {}, original_options(op), names_);
     } else {
      auto tensor_stride = invert_perm(op.stride_bytes);
-      for (int dim = 0; dim < ndim(); dim++) {
+      for (const auto dim : c10::irange(ndim())) {
        tensor_stride[dim] /= element_size;
      }
      set_output(i, tensor_shape, tensor_stride, original_options(op), names_);
@@ -593,7 +593,7 @@ void TensorIteratorBase::coalesce_dimensions() {
     if (shape0 == 1 || shape1 == 1) {
      return true;
     }
-    for (int i = 0; i < ntensors(); i++) {
+    for (const auto i : c10::irange(ntensors())) {
      auto& stride = operands_[i].stride_bytes;
      if (shape0 * stride[dim0] != stride[dim1]) {
        return false;
@@ -604,14 +604,14 @@ void TensorIteratorBase::coalesce_dimensions() {
 
   // replace each operands stride at dim0 with its stride at dim1
   auto replace_stride = [&](int dim0, int dim1) {
-    for (int i = 0; i < ntensors(); i++) {
+    for (const auto i : c10::irange(ntensors())) {
      auto& stride = operands_[i].stride_bytes;
      stride[dim0] = stride[dim1];
     }
   };
 
   int prev_dim = 0;
-  for (int dim = 1; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(1, ndim())) {
     if (can_coalesce(prev_dim, dim)) {
      if (shape_[prev_dim] == 1) {
        replace_stride(prev_dim, dim);
@@ -627,7 +627,7 @@ void TensorIteratorBase::coalesce_dimensions() {
   }
 
   shape_.resize(prev_dim + 1);
-  for (int i = 0; i < ntensors(); i++) {
+  for (const auto i : c10::irange(ntensors())) {
     operands_[i].stride_bytes.resize(ndim());
   }
   has_coalesced_dimensions_ = true;
@@ -670,7 +670,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {
 
   auto reorder = [perm](IntArrayRef data) {
     auto res = DimVector(data.size(), 0);
-    for (size_t i = 0; i < perm.size(); i++) {
+    for (const auto i : c10::irange(perm.size())) {
      res[i] = data[perm[i]];
     }
     return res;
@@ -687,7 +687,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {
 
 int64_t TensorIteratorBase::num_output_elements() const {
   int64_t elem = 1;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) {
      elem *= shape_[dim];
     }
@@ -697,7 +697,7 @@ int64_t TensorIteratorBase::num_output_elements() const {
 
 int TensorIteratorBase::num_reduce_dims() const {
   int count = 0;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     if (operands_[0].stride_bytes[dim] == 0) {
      count++;
     }
@@ -760,7 +760,7 @@ bool TensorIteratorBase::is_contiguous() const {
 
 bool TensorIteratorBase::is_scalar(int arg) const {
   const auto& stride = operands_[arg].stride_bytes;
-  for (int i = 0; i < ndim(); i++) {
+  for (const auto i : c10::irange(ndim())) {
     if (stride[i] != 0 && shape_[i] != 1) {
      return false;
     }
@@ -815,7 +815,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) {
 
 void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) {
   TORCH_INTERNAL_ASSERT(start_dim <= ndim());
-  for (int i = start_dim; i < ndim(); ++i) {
+  for (const auto i : c10::irange(start_dim, ndim())) {
     for (auto& op : operands_) {
      op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim];
     }
@@ -1063,13 +1063,13 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
 
 void TensorIteratorBase::mark_outputs() {
   // TODO: merge this into populate_operands
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     operands_[i].is_output = true;
     const auto& output = tensor(i);
     if (!output.defined()) continue;
 
     // check if output is also an input
-    for (int arg = num_outputs_; arg < ntensors(); arg++) {
+    for (const auto arg : c10::irange(num_outputs_, ntensors())) {
      const auto& input = tensor(arg);
      if (output.is_same(input)) {
        operands_[i].is_read_write = true;
@@ -1086,7 +1086,7 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config)
   if (config.static_shape_.has_value()) {
     return;
   }
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     const auto& output = tensor(i);
     if (output.defined() && !output.sizes().equals(shape_)) {
      if (config.resize_outputs_ && !operands_[i].is_read_write) {
@@ -1104,11 +1104,11 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config
   if (!config.check_mem_overlap_) {
     return;
   }
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     const auto& output = tensor_base(i);
     if (!output.defined()) continue;
     assert_no_internal_overlap(output);
-    for (int j = num_outputs_; j < ntensors(); j++) {
+    for (const auto j : c10::irange(num_outputs_, ntensors())) {
      const auto& input = tensor_base(j);
      if (!input.is_same(output)) {
        assert_no_partial_overlap(output, input);
@@ -1164,7 +1164,7 @@ void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) {
      op.stride_bytes.resize(ndim(), 0);
     else
      op.stride_bytes.resize(ndim());
-    for (size_t i = 0; i < original_shape.size(); i++) {
+    for (const auto i : c10::irange(original_shape.size())) {
      // see NOTE: [Computing output strides]
      if (original_shape[i] == 1 && shape_[offset + i] !=1) {
        op.stride_bytes[offset + i] = 0;
@@ -1183,7 +1183,7 @@ bool TensorIteratorBase::can_use_32bit_indexing() const {
   }
   for (auto& op : operands_) {
     int64_t max_offset = 1;
-    for (int dim = 0; dim < ndim(); dim++) {
+    for (const auto dim : c10::irange(ndim())) {
      max_offset += (shape_[dim] - 1) * op.stride_bytes[dim];
     }
     if (max_offset > max_value) {
@@ -1245,7 +1245,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
   switch (setup_type) {
     case FastSetupType::CONTIGUOUS:
      {
-        for (int i = 0; i < num_outputs_; i++){
+        for (const auto i : c10::irange(num_outputs_)) {
          auto& op = operands_[i];
          if (!op.tensor_base().defined()) {
            TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1256,7 +1256,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
      }
     case FastSetupType::CHANNELS_LAST:
      {
-        for (int i = 0; i < num_outputs_; i++){
+        for (const auto i : c10::irange(num_outputs_)) {
          auto& op = operands_[i];
          if (!op.tensor_base().defined()) {
            TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1273,7 +1273,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
      if (tensor(i_defined).defined()) break;
     }
     TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs");
-    for (int i = 0; i < num_outputs_; i++){
+    for (const auto i : c10::irange(num_outputs_)) {
      auto& op = operands_[i];
      if (!op.tensor_base().defined()) {
        TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -4,6 +4,7 @@
 #include <c10/util/MaybeOwned.h>
 #include <c10/util/SmallVector.h>
 #include <c10/util/TypeCast.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Dimname.h>
 #include <ATen/core/Range.h>
 #include <ATen/core/TensorBase.h>
@@ -322,9 +323,9 @@ private:
      char** base, const int64_t* strides, int64_t size0, int64_t size1) {
     PtrVector data(base, base + ntensor);
     const int64_t* outer_strides = &strides[ntensor];
-    for (int64_t i = 0; i < size1; i++) {
+    for (const auto i : c10::irange(size1)) {
      if (i > 0) {
-        for (int64_t arg = 0; arg < ntensor; arg++) {
+        for (const auto arg : c10::irange(ntensor)) {
          data[arg] += outer_strides[arg];
        }
      }
@@ -397,7 +398,7 @@ public:
 
   bool has_contiguous_first_dim() const {
     int num_tensors = ntensors();
-    for (int i = 0; i < num_tensors; i++) {
+    for (const auto i : c10::irange(num_tensors)) {
      if (strides(i)[0] != element_size(i)) {
        return false;
      }
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <ATen/native/TensorIterator.h>
|
#include <ATen/native/TensorIterator.h>
|
||||||
#include <c10/util/SmallBuffer.h>
|
#include <c10/util/SmallBuffer.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
|
|
||||||
@ -24,9 +25,9 @@ inline void get_data_ptrs(
|
|||||||
const int64_t ntensors = base.size();
|
const int64_t ntensors = base.size();
|
||||||
const int64_t ndim = counter.size();
|
const int64_t ndim = counter.size();
|
||||||
std::copy(base.begin(), base.end(), ptrs);
|
std::copy(base.begin(), base.end(), ptrs);
|
||||||
for (int64_t dim = 0; dim < ndim; ++dim) {
|
for (const auto dim : c10::irange(ndim)) {
|
||||||
int64_t value = counter[dim];
|
int64_t value = counter[dim];
|
||||||
for (int64_t arg = 0; arg < ntensors; ++arg) {
|
for (const auto arg : c10::irange(ntensors)) {
|
||||||
ptrs[arg] += value * strides[dim * ntensors + arg];
|
ptrs[arg] += value * strides[dim * ntensors + arg];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -56,7 +56,7 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
   start = maybe_wrap_dim(start, names.size());
   end = maybe_wrap_dim(end, names.size());
   names_.reserve(end - start);
-  for (int64_t idx = start; idx < end; ++idx) {
+  for (const auto idx : c10::irange(start, end)) {
     names_.emplace_back(names, idx);
   }
 }
@@ -2,6 +2,7 @@
 #include <ATen/Config.h>
 #include <ATen/TensorUtils.h>
 #include <c10/util/accumulate.h>
+#include <c10/util/irange.h>
 
 #include <ostream>
 #include <sstream>
@@ -323,7 +324,7 @@ size_t computeStorageNbytes(
   // size of the underlying storage is 1 bigger than the offset
   // of the last element according to stride
   size_t size = 1;
-  for(size_t i = 0; i < sizes.size(); i++) {
+  for (const auto i : c10::irange(sizes.size())) {
     if(sizes[i] == 0) {
      return 0;
     }
@@ -83,7 +83,7 @@ VmapDimVector VmapPhysicalView::getPhysicalShape(IntArrayRef logical_shape) cons
 static BatchDims computeFrontBatchDimsFromLevels(std::bitset<kVmapNumLevels> levels_bitset) {
   BatchDims bdims;
   int64_t dim = 0;
-  for (int64_t level = 0; level < kVmapNumLevels; level++) {
+  for (const auto level : c10::irange(kVmapNumLevels)) {
     if (!levels_bitset[level]) {
      continue;
     }
@@ -208,7 +208,7 @@ MultiBatchVmapTransform::logicalToPhysical(TensorList logical_tensors) {
   VmapDimVector batch_sizes(num_batch_dims, 1);
   for (const auto& physical_tensor : physical_tensors) {
     auto physical_sizes = physical_tensor.sizes();
-    for (int64_t dim = 0; dim < num_batch_dims; dim++) {
+    for (const auto dim : c10::irange(num_batch_dims)) {
      if (physical_sizes[dim] != 1) {
        batch_sizes[dim] = physical_sizes[dim];
      }
@@ -2,6 +2,7 @@
 
 #include <c10/core/WrapDimMinimal.h>
 #include <c10/core/TensorImpl.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Tensor.h>
 
 namespace at {
@@ -40,7 +41,7 @@ static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_p
   }
   int64_t min = -dim_post_expr;
   int64_t max = dim_post_expr - 1;
-  for (int64_t i = 0; i < ndims; ++i) {
+  for (const auto i : c10::irange(ndims)) {
     auto &dim = dims[i];
     if (dim < min || dim > max) {
      TORCH_CHECK_INDEX(false,
@@ -85,7 +86,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) {
 
 // wrap negative dims in a vector
 static inline void wrap_all_dims(std::vector<int64_t>& dims_to_wrap, int64_t tensor_total_dims) {
-  for (size_t i = 0; i < dims_to_wrap.size(); i++) {
+  for (const auto i : c10::irange(dims_to_wrap.size())) {
     dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims);
   }
 }
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/core/TensorImpl.h>
+#include <c10/util/irange.h>
 #include <ATen/WrapDimUtils.h>
 #include <sstream>
 #include <bitset>
@@ -15,7 +16,7 @@ constexpr size_t dim_bitset_size = 64;
 static inline std::bitset<dim_bitset_size> dim_list_to_bitset(IntArrayRef dims, int64_t ndims) {
   TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported");
   std::bitset<dim_bitset_size> seen;
-  for (size_t i = 0; i < dims.size(); i++) {
+  for (const auto i : c10::irange(dims.size())) {
     size_t dim = maybe_wrap_dim(dims[i], ndims);
     TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims");
     seen[dim] = true;
@@ -1,4 +1,5 @@
 #include <benchmark/benchmark.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/passes/xnnpack_rewrite.h>
 #include <torch/csrc/autograd/generated/variable_factories.h>
 #include <torch/csrc/jit/api/module.h>
@@ -33,7 +34,7 @@ static void stateful_conv1d(benchmark::State& state) {
   )");
 
   std::vector<std::vector<torch::jit::IValue>> inputs;
-  for (int i = 0; i < 10; ++i) {
+  for (const auto i : c10::irange(10)) {
     std::vector<torch::jit::IValue> input;
     // NOLINTNEXTLINE(modernize-use-emplace)
     input.push_back(torch::rand({batch_size, input_channels, width}));
@@ -70,8 +71,8 @@ static void GenerateSizes(benchmark::internal::Benchmark* b) {
 
   for (size_t input_channels = 32; input_channels < 256; input_channels *= 2) {
     for (size_t output_channels = 32; output_channels < 256; output_channels *= 2) {
-      for (size_t kernel = 3; kernel < 8; ++kernel) {
-        for (size_t batch_size = 1; batch_size < 5; ++batch_size) {
+      for (const auto kernel : c10::irange(3, 8)) {
+        for (const auto batch_size : c10::irange(1, 5)) {
          for (size_t width = 32; width < 256; width *= 2) {
            b->Args({input_channels, output_channels, kernel, batch_size, width, true});
            b->Args({input_channels, output_channels, kernel, batch_size, width, false});
@@ -4,6 +4,7 @@
 // device code.
 
 #include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace detail {
 
@@ -1,4 +1,5 @@
 #include <ATen/core/Formatting.h>
+#include <c10/util/irange.h>
 
 #include <cmath>
 #include <cstdint>
@@ -44,7 +45,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
   }
   bool intMode = true;
   auto self_p = self.data_ptr<double>();
-  for(int64_t i = 0; i < size; i++) {
+  for (const auto i : c10::irange(size)) {
     auto z = self_p[i];
     if(std::isfinite(z)) {
      if(z != std::ceil(z)) {
@@ -70,7 +71,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
   } else {
     expMin = fabs(self_p[offset]);
     expMax = fabs(self_p[offset]);
-    for(int64_t i = offset; i < size; i++) {
+    for (const auto i : c10::irange(offset, size)) {
      double z = fabs(self_p[i]);
      if(std::isfinite(z)) {
        if(z < expMin) {
@@ -130,7 +131,8 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
 
 static void __printIndent(std::ostream &stream, int64_t indent)
 {
-  for(int64_t i = 0; i < indent; i++) {
+  for (const auto i : c10::irange(indent)) {
+    (void)i; //Suppress unused variable warning
     stream << " ";
   }
 }
@@ -168,7 +170,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
     printScale(stream,scale);
     __printIndent(stream, indent);
   }
-  for(int64_t l = 0; l < self.size(0); l++) {
+  for (const auto l : c10::irange(self.size(0))) {
     Tensor row = self.select(0,l);
     double *row_ptr = row.data_ptr<double>();
     for(int64_t c = firstColumn; c < lastColumn+1; c++) {
@@ -198,8 +200,7 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize)
   bool start = true;
   bool finished = false;
   counter[0] = -1;
-  for(size_t i = 1; i < counter.size(); i++)
-    counter[i] = 0;
+  for (const auto i : c10::irange(1, counter.size()))counter[i] = 0;
   while(true) {
     for(int64_t i = 0; self.ndimension()-2; i++) {
      counter[i] = counter[i] + 1;
@@ -269,7 +270,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
      printScale(stream, scale);
     }
     double* tensor_p = tensor.data_ptr<double>();
-    for (int64_t i = 0; i < tensor.size(0); i++) {
+    for (const auto i : c10::irange(tensor.size(0))) {
      stream << std::setw(sz) << tensor_p[i]/scale << std::endl;
     }
   }
@@ -284,7 +285,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
     __printTensor(stream, tensor, linesize);
   }
   stream << "[ " << tensor_.toString() << "{" << tensor.size(0);
-  for(int64_t i = 1; i < tensor.ndimension(); i++) {
+  for (const auto i : c10::irange(1, tensor.ndimension())) {
     stream << "," << tensor.size(i);
   }
   stream << "}";
@@ -155,7 +155,7 @@ private:
   data_.seed_ = seed;
   data_.seeded_ = true;
   data_.state_[0] = seed & 0xffffffff;
-  for(int j = 1; j < MERSENNE_STATE_N; j++) {
+  for (const auto j : c10::irange(1, MERSENNE_STATE_N)) {
     data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
   }
   data_.left_ = 1;
@@ -3,6 +3,7 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/Deprecated.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <stdint.h>
 #include <cstddef>
 
@@ -134,7 +135,7 @@ public:
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : data_(data_) {
-    for (int i = 0; i < N; i++) {
+    for (const auto i : c10::irange(N)) {
      this->sizes_[i] = sizes_[i];
      this->strides_[i] = strides_[i];
     }
@@ -7,6 +7,7 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/ivalue.h>
 #include <c10/core/CPUAllocator.h>
+#include <c10/util/irange.h>

 template<class... Inputs>
 inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
@@ -87,7 +88,7 @@ inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
 template<class T, size_t N>
 void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
@@ -95,7 +96,7 @@ void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
@@ -103,7 +104,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual.get(i));
 }
 }
@@ -111,7 +112,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, std::vector<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
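Note (illustrative, not part of this patch): in the test helpers above the bound is expected.size(), a size_t; with c10::irange(expected.size()) the index is deduced as size_t, so there is no signed/unsigned comparison to warn about and no separate index declaration. A hedged sketch of the same shape outside of gtest:

#include <c10/util/irange.h>
#include <cassert>
#include <vector>

// Compare two sequences element by element, mirroring expectListEquals.
// Plain asserts stand in for EXPECT_EQ; this is illustrative only.
template <typename T>
void check_equal(const std::vector<T>& expected, const std::vector<T>& actual) {
  assert(expected.size() == actual.size());
  for (const auto i : c10::irange(expected.size())) {  // i is a size_t
    assert(expected[i] == actual[i]);
  }
}
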
@@ -5,6 +5,7 @@
 #include <ATen/core/jit_type.h>
 #include <c10/util/Bitset.h>
 #include <c10/core/DispatchKeySet.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Variadic.h>
 #include <ATen/core/stack.h>

@@ -171,7 +172,7 @@ private:
 "The function schema has ", schema.arguments().size(),
 " arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS());
 c10::utils::bitset dispatch_arg_indices_reverse;
-for (size_t index = 0; index < schema.arguments().size(); ++index) {
+for (const auto index : c10::irange(schema.arguments().size())) {
 if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) ||
 schema.arguments()[index].type()->isSubtypeOf(
 *ListType::ofTensors()) ||

@@ -5,6 +5,7 @@
 #include <ATen/Functions.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/op_registration/op_registration.h>
+#include <c10/util/irange.h>
 #include <torch/library.h>

 using namespace at;
@@ -51,7 +52,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

 // Unwrap all arguments
 auto args = torch::jit::pop(*stack, num_arguments);
-for (size_t i = 0; i < num_arguments; i++) {
+for (const auto i : c10::irange(num_arguments)) {
 // TODO: Handle tensor list
 if (args[i].isTensor()) {
 auto* impl = args[i].unsafeToTensorImpl();
@@ -70,7 +71,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

 // Rewrap outputs
 auto rets = torch::jit::pop(*stack, num_returns);
-for (size_t i = 0; i < num_returns; i++) {
+for (const auto i : c10::irange(num_returns)) {
 // TODO: Handle tensor list
 if (rets[i].isTensor()) {
 torch::jit::push(*stack, at::detail::make_tensor<GenericWrapperTensorImpl>(std::move(rets[i]).toTensor())); // yes move!

@@ -2,6 +2,7 @@

 #include <c10/util/StringUtil.h>
 #include <c10/util/string_view.h>
+#include <c10/util/irange.h>
 #include <ATen/core/jit_type.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/core/ivalue.h>
@@ -16,7 +16,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
 out << "(";

 bool seen_kwarg_only = false;
-for(size_t i = 0; i < schema.arguments().size(); ++i) {
+for (const auto i : c10::irange(schema.arguments().size())) {
 if (i > 0) out << ", ";
 if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) {
 out << "*, ";
@@ -35,7 +35,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

 const auto& returns = schema.returns();
 out << "(";
-for(size_t i = 0; i < returns.size(); ++i) {
+for (const auto i : c10::irange(returns.size())) {
 if (i > 0) {
 out << ", ";
 }
@@ -53,7 +53,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

 inline size_t findFirstOutArg(const std::vector<Argument>& args) {
 // find the start of out args in the schema
-for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
+for (const auto out_start_idx : c10::irange(args.size())) {
 if (args.at(out_start_idx).is_out()) {
 return out_start_idx;
 }
@@ -122,7 +122,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 && arguments().size() >= old.arguments().size())) {
 return false;
 }
-for (size_t i = 0; i < returns().size(); ++i) {
+for (const auto i : c10::irange(returns().size())) {
 // Backwards compatibility requires covariance on argument types
 // (i.e. more generic), and contravariance on return types (i.e.
 // more specific).
@@ -138,7 +138,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 size_t new_out_start_idx = findFirstOutArg(arguments());

 // make sure among the default args, they are backward compatible
-for (size_t i = 0; i < old_out_start_idx; i++) {
+for (const auto i : c10::irange(old_out_start_idx)) {
 if (!arguments().at(i).isBackwardCompatibleWith(
 old.arguments().at(i), why_not)) {
 return false;
@@ -146,7 +146,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 }

 // // Validate that all new arguments provided has a default value
-for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
+for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) {
 if (!arguments().at(i).default_value()) {
 if (why_not) {
 *why_not
@@ -160,7 +160,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 }

 // now compare the out args
-for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
+for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) {
 if (!arguments()
 .at(i - old_out_start_idx + new_out_start_idx)
 .isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
@@ -238,7 +238,7 @@ inline void FunctionSchema::checkAndNormalizeInputs(
 *this);

 size_t consumed_kwargs = 0;
-for (size_t pos = 0; pos < arguments().size(); ++pos) {
+for (const auto pos : c10::irange(arguments().size())) {
 const auto& argument = arguments()[pos];
 if (pos < inputs.size()) {
 checkArg(inputs[pos], argument, pos);
@@ -298,7 +298,7 @@ inline bool isSubtypeOfList(
 if (child.size() != parent.size()) {
 return false;
 }
-for (size_t i = 0; i < child.size(); ++i) {
+for (const auto i : c10::irange(child.size())) {
 const Argument& c = child[i];
 const Argument& p = parent[i];
 if (c.name() != p.name()) {
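Note (illustrative, not part of this patch): one behavioral point about these conversions is that the range-for variable is const and the bound passed to c10::irange is evaluated once, up front. Loops that mutate their index or re-check a changing bound each iteration therefore have to stay in their original form. A small hypothetical sketch of the distinction:

#include <c10/util/irange.h>
#include <vector>

int count_positives(const std::vector<int>& v) {
  int count = 0;
  for (const auto i : c10::irange(v.size())) {
    // Fine: i is only read. Assigning to i would not compile, since it is const.
    if (v[i] > 0) {
      ++count;
    }
  }
  // A loop like `for (size_t i = 0; i < v.size(); ) { ...; i += step; }`
  // cannot be converted mechanically and keeps its classic form.
  return count;
}
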
@@ -22,6 +22,7 @@
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/irange.h>
 #include <c10/util/hash.h>
+#include <c10/util/irange.h>

 namespace torch {
 namespace jit {
@@ -1114,7 +1115,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 }
 std::ostringstream oss;
 oss << devices[0];
-for (size_t idx = 1; idx < devices.size(); idx++) {
+for (const auto idx : c10::irange(1, devices.size())) {
 if (idx == devices.size() - 1) {
 oss << " and ";
 } else {
@@ -1131,7 +1132,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 return c10::kCPU;
 }
 c10::DeviceType deviceType = devices[0].type();
-for (size_t idx = 1; idx < devices.size(); idx++) {
+for (const auto idx : c10::irange(1, devices.size())) {
 TORCH_CHECK_VALUE(
 devices[idx].type() == deviceType,
 "Expected all devices to be of the same type, but got a mismatch between ",
@@ -1151,7 +1152,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
 // Deduplicate by compacting.
 size_t targetIdx = 0;
-for (size_t sourceIdx = 0; sourceIdx < devices.size(); sourceIdx++) {
+for (const auto sourceIdx : c10::irange(devices.size())) {
 TORCH_CHECK_VALUE(
 devices[sourceIdx].has_index(),
 "Expected devices to have indices, got ", devices[sourceIdx]);
@@ -1,4 +1,5 @@
 #include <ATen/core/op_registration/infer_schema.h>
+#include <c10/util/irange.h>
 #include <sstream>

 namespace c10 {
@@ -20,7 +21,7 @@ std::string fastToString(size_t x) {
 std::vector<Argument> createArgumentVector(c10::ArrayRef<ArgumentDef> args) {
 std::vector<Argument> result;
 result.reserve(args.size());
-for (size_t i = 0; i < args.size(); ++i) {
+for (const auto i : c10::irange(args.size())) {
 // Arguments are named "_<index>"
 result.emplace_back(fastToString(i), (*args[i].getTypeFn)());
 }
@@ -49,7 +50,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
 " vs " + guts::to_string(rhs.returns().size());
 }

-for (size_t i = 0; i < lhs.arguments().size(); ++i) {
+for (const auto i : c10::irange(lhs.arguments().size())) {
 const TypePtr& leftType = lhs.arguments()[i].type();
 const TypePtr& rightType = rhs.arguments()[i].type();
 // Type::operator== is virtual. Comparing pointers first is
@@ -61,7 +62,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
 }
 }

-for (size_t i = 0; i < lhs.returns().size(); ++i) {
+for (const auto i : c10::irange(lhs.returns().size())) {
 const TypePtr& leftType = lhs.returns()[i].type();
 const TypePtr& rightType = rhs.returns()[i].type();
 // See above about comparing pointers first.

@@ -3,6 +3,7 @@
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
+#include <c10/util/irange.h>
 #include <string>

 namespace c10 {
@@ -69,7 +70,7 @@ struct QualifiedName {
 // Can't be a prefix if it's bigger
 return false;
 }
-for (size_t i = 0; i < thisAtoms.size(); i++) {
+for (const auto i : c10::irange(thisAtoms.size())) {
 if (thisAtoms[i] != otherAtoms[i]) {
 return false;
 }
@@ -116,7 +117,7 @@ struct QualifiedName {
 reserve += e.size() + 1;
 }
 out.reserve(reserve);
-for (size_t i = 0; i < v.size(); ++i) {
+for (const auto i : c10::irange(v.size())) {
 if (i != 0) {
 out.push_back(delimiter);
 }

@@ -4,6 +4,7 @@

 #include <ATen/core/ivalue.h>
 #include <c10/util/Deprecated.h>
+#include <c10/util/irange.h>

 // TODO move this to c10 namespace

@@ -108,7 +109,7 @@ static inline IValue pop(Stack* stack) {
 static inline std::vector<IValue> pop(Stack& stack, size_t n) {
 std::vector<IValue> result;
 result.reserve(n);
-for (size_t i = 0; i < n; ++i) {
+for (const auto i : c10::irange(n)) {
 result.push_back(std::move(peek(stack, i, n)));
 }
 drop(stack, n);
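Note (illustrative, not part of this patch): for readers without the PyTorch tree handy, the idea behind c10::irange can be reproduced in a few lines of standard C++. The sketch below is a simplified, hypothetical stand-in, not the real implementation in c10/util/irange.h; it only shows why a range of integers can drive a range-based for loop:

#include <cstdint>
#include <iostream>

// Minimal integer range: holds begin/end and exposes iterators that
// dereference to the current integer. Not the actual c10 implementation.
struct int_range {
  struct iterator {
    int64_t value;
    int64_t operator*() const { return value; }
    iterator& operator++() { ++value; return *this; }
    bool operator!=(const iterator& other) const { return value != other.value; }
  };
  int64_t begin_, end_;
  iterator begin() const { return {begin_}; }
  iterator end() const { return {end_}; }
};

int_range irange(int64_t end) { return {0, end}; }
int_range irange(int64_t begin, int64_t end) { return {begin, end}; }

int main() {
  for (const auto i : irange(3)) {
    std::cout << i << ' ';  // prints: 0 1 2
  }
  std::cout << '\n';
}
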
@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <ATen/cpu/vec/vec.h>
+#include <c10/util/irange.h>

 namespace at { namespace vec {

@@ -16,7 +17,7 @@ inline scalar_t vec_reduce_all(
 using Vec = vec::Vectorized<scalar_t>;
 scalar_t acc_arr[Vec::size()];
 acc_vec.store(acc_arr);
-for (int64_t i = 1; i < size; i++) {
+for (const auto i : c10::irange(1, size)) {
 std::array<scalar_t, Vec::size()> acc_arr_next = {0};
 acc_arr_next[0] = acc_arr[i];
 Vec acc_vec_next = Vec::loadu(acc_arr_next.data());

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>

@@ -109,7 +110,7 @@ public:
 Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
 __at_align__ c10::complex<double> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -293,7 +294,7 @@ public:
 __at_align__ c10::complex<double> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
@@ -144,7 +145,7 @@ public:
 Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
 __at_align__ c10::complex<float> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -327,7 +328,7 @@ public:
 __at_align__ c10::complex<float> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 #include <sleef.h>
 #endif
@@ -72,7 +73,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -103,7 +104,7 @@ public:
 Vectorized<double> map(double (*const f)(double)) const {
 __at_align__ double tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -180,7 +181,7 @@ public:
 __at_align__ double tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -190,7 +191,7 @@ public:
 __at_align__ double tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);

@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 #include <sleef.h>
 #endif
@@ -80,7 +81,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -109,7 +110,7 @@ public:
 Vectorized<float> map(float (*const f)(float)) const {
 __at_align__ float tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -217,7 +218,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -227,7 +228,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@@ -221,7 +222,7 @@ public:
 }
 else {
 __at_align__ float tmp_values[size()];
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -287,7 +288,7 @@ public:
 __at_align__ float tmp[size()];
 __at_align__ float res[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 if (_isnan(tmp[i])) {
 std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
 } else {
@@ -299,7 +300,7 @@ public:
 Vectorized<float> map(float (*const f)(float)) const {
 __at_align__ float tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -336,7 +337,7 @@ public:
 __at_align__ float tmp_exp[size()];
 store(tmp);
 exp.store(tmp_exp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
 }
 return loadu(tmp);
@@ -371,7 +372,7 @@ public:
 __at_align__ float tmp_q[size()];
 store(tmp);
 q.store(tmp_q);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::fmod(tmp[i], tmp_q[i]);
 }
 return loadu(tmp);
@@ -381,7 +382,7 @@ public:
 __at_align__ float tmp_b[size()];
 store(tmp);
 b.store(tmp_b);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::hypot(tmp[i], tmp_b[i]);
 }
 return loadu(tmp);
@@ -397,7 +398,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -407,7 +408,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -429,7 +430,7 @@ public:
 __at_align__ float tmp_b[size()];
 store(tmp);
 b.store(tmp_b);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
 }
 return loadu(tmp);
@@ -494,7 +495,7 @@ public:
 __at_align__ float tmp_exp[size()];
 store(tmp);
 exp.store(tmp_exp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::pow(tmp[i], tmp_exp[i]);
 }
 return loadu(tmp);
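Note (illustrative, not part of this patch): the vectorized-math hunks above all share one shape: spill the SIMD register to a small stack array, loop over size() (a constexpr element count), apply a scalar function, and reload. A hedged sketch of that shape with plain arrays; kWidth and map_lanes are stand-ins and not the actual Vectorized<T> API surface:

#include <c10/util/irange.h>

// Element-wise map over a fixed-width lane array, in the same style as the
// Vectorized<float>::map() bodies converted above.
template <int kWidth>
void map_lanes(float (*const f)(float), float (&lanes)[kWidth]) {
  // kWidth plays the role of the constexpr Vectorized<float>::size().
  for (const auto i : c10::irange(kWidth)) {
    lanes[i] = f(lanes[i]);
  }
}

// Possible usage: float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//                 map_lanes<8>(+[](float x) { return x * x; }, buf);
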
@@ -6,6 +6,7 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
 #include <iostream>

 namespace at {
@@ -98,7 +99,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
@@ -221,7 +222,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
@@ -435,7 +436,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
@@ -684,7 +685,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
@@ -6,6 +6,8 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/native/quantized/affine_quantizer_base.h>
+
+#include <c10/util/irange.h>
 #include <c10/util/qint32.h>
 #include <c10/util/qint8.h>
 #include <c10/util/quint8.h>
@@ -739,7 +741,7 @@ struct VectorizedQuantizedConverter {
 std::array<value_type, size_> vals;

 VectorizedQuantizedConverter(T val) {
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 vals[i] = val.val_;
 }
 }
@@ -757,9 +759,9 @@ struct VectorizedQuantizedConverter {
 Vectorized<float> zero_point,
 Vectorized<float> scale_zp_premul) const {
 float_vec_return_type rv;
-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 float tmp_vals[8];
-for (int j = 0; j < 8; ++j) {
+for (const auto j : c10::irange(8)) {
 tmp_vals[j] = at::native::dequantize_val<T>(
 scale[j], zero_point[j], T(vals[8 * i + j]));
 }
@@ -816,7 +818,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -832,7 +834,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -840,7 +842,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -855,7 +857,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 Vectorized<c10::qint32> zero_point,
 Vectorized<c10::qint32> q_six) {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -864,7 +866,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
 int_vec_return_type retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval[0].vals[i] = vals[i] - b.vals[i];
 }
 return retval;
@@ -875,7 +877,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 float multiplier,
 int32_t zero_point) {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] =
 nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
 zero_point;
@@ -948,7 +950,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -964,7 +966,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -972,7 +974,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -986,7 +988,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 Vectorized<c10::qint8> zero_point,
 Vectorized<c10::qint8> q_six) {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -996,8 +998,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
 int_vec_return_type retval;
 constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 retval[i].vals[j] =
 static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
 static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1013,8 +1015,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 constexpr auto min_val = std::numeric_limits<value_type>::min();
 constexpr auto max_val = std::numeric_limits<value_type>::max();
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 int32_t rounded =
 nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
 zero_point;
@@ -1068,7 +1070,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -1084,7 +1086,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -1092,7 +1094,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -1107,7 +1109,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 Vectorized<c10::quint8> zero_point,
 Vectorized<c10::quint8> q_six) {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -1117,8 +1119,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
 int_vec_return_type retval;
 constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 retval[i].vals[j] =
 static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
 static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1134,8 +1136,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 constexpr auto min_val = std::numeric_limits<value_type>::min();
 constexpr auto max_val = std::numeric_limits<value_type>::max();
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 int32_t rounded =
 nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
 zero_point;
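Note (illustrative, not part of this patch): the quantized converter hunks convert nested loops; each level gets its own c10::irange, and the inner bound can be a runtime value such as elem_per_int_vec. A small hedged sketch of the same nesting, with invented buffer names and sizes:

#include <c10/util/irange.h>
#include <cstdint>
#include <vector>

// Widening subtraction over groups of lanes, mirroring the nested loops above:
// the outer loop walks the sub-vectors, the inner loop walks lanes within one.
// Assumes a.size() == b.size() == num_vecs * elem_per_vec.
std::vector<int32_t> widening_subtract(const std::vector<int8_t>& a,
                                       const std::vector<int8_t>& b,
                                       int num_vecs, int elem_per_vec) {
  std::vector<int32_t> out(a.size());
  for (const auto i : c10::irange(num_vecs)) {
    for (const auto j : c10::irange(elem_per_vec)) {
      const auto idx = i * elem_per_vec + j;
      out[idx] = static_cast<int32_t>(a[idx]) - static_cast<int32_t>(b[idx]);
    }
  }
  return out;
}
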
@@ -3,6 +3,7 @@
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
 #include <c10/util/complex.h>
+#include <c10/util/irange.h>

 namespace at {
 namespace vec {
@@ -167,7 +168,7 @@ class Vectorized<ComplexDbl> {
 Vectorized<ComplexDbl> map(ComplexDbl (*const f)(ComplexDbl)) const {
 __at_align__ ComplexDbl tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -176,7 +177,7 @@ class Vectorized<ComplexDbl> {
 Vectorized<ComplexDbl> map(ComplexDbl (*const f)(const ComplexDbl&)) const {
 __at_align__ ComplexDbl tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -454,7 +455,7 @@ class Vectorized<ComplexDbl> {
 __at_align__ ComplexDbl y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
 #include <c10/util/complex.h>
+#include <c10/util/irange.h>

 namespace at {
 namespace vec {
@@ -222,7 +223,7 @@ class Vectorized<ComplexFlt> {
 Vectorized<ComplexFlt> map(ComplexFlt (*const f)(ComplexFlt)) const {
 __at_align__ ComplexFlt tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -231,7 +232,7 @@ class Vectorized<ComplexFlt> {
 Vectorized<ComplexFlt> map(ComplexFlt (*const f)(const ComplexFlt&)) const {
 __at_align__ ComplexFlt tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -430,7 +431,7 @@ class Vectorized<ComplexFlt> {
 __at_align__ ComplexFlt y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -3,6 +3,8 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+
+#include <c10/util/irange.h>
 #include <c10/util/quint8.h>
 #include <array>

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
@@ -149,7 +150,7 @@ public:
 Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
 __at_align__ c10::complex<double> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -357,7 +358,7 @@ public:
 __at_align__ c10::complex<double> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
@@ -667,7 +668,7 @@ public:
 Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
 __at_align__ c10::complex<float> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -858,7 +859,7 @@ public:
 __at_align__ c10::complex<float> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -5,6 +5,7 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -87,7 +88,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -120,7 +121,7 @@ public:
Vectorized<double> map(double (*const f)(double)) const {
__at_align__ double tmp[size()];
store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -200,7 +201,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -210,7 +211,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);

@@ -5,6 +5,7 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -104,7 +105,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -135,7 +136,7 @@ public:
Vectorized<float> map(float (*const f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -246,7 +247,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -256,7 +257,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);

@@ -6,6 +6,7 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>

namespace at {
namespace vec {
@@ -100,7 +101,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
@@ -253,7 +254,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
@@ -485,7 +486,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
@@ -761,7 +762,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int8_t));

@@ -6,6 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/affine_quantizer_base.h>
+
+#include <c10/util/irange.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>
@@ -744,7 +746,7 @@ struct VectorizedQuantizedConverter {
std::array<value_type, size_> vals;

VectorizedQuantizedConverter(T val) {
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
vals[i] = val.val_;
}
}
@@ -762,9 +764,9 @@ struct VectorizedQuantizedConverter {
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
float_vec_return_type rv;
-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
float tmp_vals[16];
-for (int j = 0; j < 16; ++j) {
+for (const auto j : c10::irange(16)) {
tmp_vals[j] = at::native::dequantize_val<T>(
scale[j], zero_point[j], T(vals[16 * i + j]));
}
@@ -829,7 +831,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -845,7 +847,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -853,7 +855,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -868,7 +870,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
Vectorized<c10::qint32> zero_point,
Vectorized<c10::qint32> q_six) {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -877,7 +879,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
int_vec_return_type retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval[0].vals[i] = vals[i] - b.vals[i];
}
return retval;
@@ -888,7 +890,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
float multiplier,
int32_t zero_point) {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] =
nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
zero_point;
@@ -961,7 +963,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -977,7 +979,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -985,7 +987,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -999,7 +1001,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
Vectorized<c10::qint8> zero_point,
Vectorized<c10::qint8> q_six) {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -1009,8 +1011,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
int_vec_return_type retval;
constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
retval[i].vals[j] =
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1026,8 +1028,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
constexpr auto min_val = std::numeric_limits<value_type>::min();
constexpr auto max_val = std::numeric_limits<value_type>::max();
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
int32_t rounded =
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
zero_point;
@@ -1081,7 +1083,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -1097,7 +1099,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -1105,7 +1107,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -1120,7 +1122,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
Vectorized<c10::quint8> zero_point,
Vectorized<c10::quint8> q_six) {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -1130,8 +1132,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
int_vec_return_type retval;
constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
retval[i].vals[j] =
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1147,8 +1149,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
constexpr auto min_val = std::numeric_limits<value_type>::min();
constexpr auto max_val = std::numeric_limits<value_type>::max();
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
int32_t rounded =
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
zero_point;

@@ -31,6 +31,7 @@
#include <ATen/native/cpu/zmath.h>
#include <c10/util/TypeCast.h>
#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>

// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
@@ -150,7 +151,7 @@ public:
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
int64_t mask = mask_;
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (mask & 0x01) {
vector[i] = b[i];
} else {
@@ -165,7 +166,7 @@ public:
Vectorized vector;
int_same_size_t<T> buffer[size()];
mask.store(buffer);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (buffer[i] & 0x01)
{
vector[i] = b[i];
@@ -178,14 +179,14 @@ public:
template<typename step_t> // step sometimes requires a higher precision type (e.g., T=int, step_t=double)
static Vectorized<T> arange(T base = static_cast<T>(0), step_t step = static_cast<step_t>(1)) {
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
vector.values[i] = base + i * step;
}
return vector;
}
static Vectorized<T> set(const Vectorized<T>& a, const Vectorized<T>& b, int64_t count = size()) {
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (i < count) {
vector[i] = b[i];
} else {
@@ -340,7 +341,7 @@ public:
}
Vectorized<T> atan2(const Vectorized<T> &exp) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::atan2(values[i], exp[i]);
}
return ret;
@@ -380,7 +381,7 @@ public:
// U is for SFINAE purposes only. Make sure it is not changed.
static_assert(std::is_same<U, T>::value, "U must be T");
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::fmod(values[i], q[i]);
}
return ret;
@@ -423,7 +424,7 @@ public:
}
Vectorized<T> hypot(const Vectorized<T> &b) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::hypot(values[i], b[i]);
}
return ret;
@@ -436,14 +437,14 @@ public:
}
Vectorized<T> igamma(const Vectorized<T> &x) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = calc_igamma(values[i], x[i]);
}
return ret;
}
Vectorized<T> igammac(const Vectorized<T> &x) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = calc_igammac(values[i], x[i]);
}
return ret;
@@ -456,7 +457,7 @@ public:
}
Vectorized<T> nextafter(const Vectorized<T> &b) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::nextafter(values[i], b[i]);
}
return ret;
@@ -494,7 +495,7 @@ public:
}
Vectorized<T> pow(const Vectorized<T> &exp) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::pow(values[i], exp[i]);
}
return ret;
@@ -808,7 +809,7 @@ inline gather(T const* base_addr, const Vectorized<int_same_size_t<T>>& vindex)
int_same_size_t<T> index_arr[size];
vindex.store(static_cast<void*>(index_arr));
T buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
}
return Vectorized<T>::loadu(static_cast<void*>(buffer));
@@ -826,7 +827,7 @@ inline mask_gather(const Vectorized<T>& src, T const* base_addr,
mask.store(static_cast<void*>(mask_arr));
vindex.store(static_cast<void*>(index_arr));
T buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
if (mask_arr[i] & 0x01) { // check highest bit
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
} else {
@@ -872,7 +873,7 @@ inline Vectorized<int_same_size_t<T>> convert_to_int_of_same_size(const Vectoriz
T src_arr[size];
src.store(static_cast<void*>(src_arr));
int_same_size_t<T> buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
buffer[i] = static_cast<int_same_size_t<T>>(src_arr[i]);
}
return Vectorized<int_same_size_t<T>>::loadu(static_cast<void*>(buffer));
@@ -899,7 +900,7 @@ deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
T buffer2[size];
a.store(static_cast<void*>(a_arr));
b.store(static_cast<void*>(b_arr));
-for (int64_t i = 0; i < half_size; i++) {
+for (const auto i : c10::irange(half_size)) {
buffer1[i] = a_arr[i * 2];
buffer1[half_size + i] = b_arr[i * 2];
buffer2[i] = a_arr[i * 2 + 1];
@@ -931,7 +932,7 @@ interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
T buffer2[size];
a.store(static_cast<void*>(a_arr));
b.store(static_cast<void*>(b_arr));
-for (int64_t i = 0; i < half_size; i++) {
+for (const auto i : c10::irange(half_size)) {
buffer1[i * 2] = a_arr[i];
buffer1[i * 2 + 1] = b_arr[i];
buffer2[i * 2] = a_arr[half_size + i];
@@ -946,7 +947,8 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
#ifndef _MSC_VER
# pragma unroll
#endif
-for (int64_t i = 0; i < n; i++) {
+for (const auto i : c10::irange(n)) {
+(void)i; //Suppress unused variable warning
*dst = c10::static_cast_with_inter_type<dst_T, src_T>::apply(*src);
src++;
dst++;

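One detail from the convert() hunk above: when the induction variable is not used in the loop body, the rewrite adds a (void)i; cast so the irange form does not trigger unused-variable warnings. A hedged sketch of that pattern (the function name copy_cast is hypothetical), assuming <c10/util/irange.h> is available:

    #include <c10/util/irange.h>
    #include <cstdint>

    // Converts n elements; the loop index only counts iterations.
    void copy_cast(const float* src, double* dst, int64_t n) {
      for (const auto i : c10::irange(n)) {
        (void)i; // suppress unused-variable warning; only the pointers advance
        *dst = static_cast<double>(*src);
        src++;
        dst++;
      }
    }
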
@@ -4,6 +4,7 @@

#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
+#include <c10/util/irange.h>

#define CUDABLAS_POSINT_CHECK(FD, X) \
TORCH_CHECK( \
@@ -295,7 +296,7 @@ void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
c, CUDA_R_16F, ldc, stridec,
num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
-for (int64_t i = 0; i < num_batches; ++i) {
+for (const auto i : c10::irange(num_batches)) {
at::cuda::blas::gemm<at::Half>(
transa, transb,
m, n, k,

@@ -1,6 +1,7 @@
#include <ATen/cudnn/Descriptors.h>

#include <ATen/ATen.h>
+#include <c10/util/irange.h>

#include <iostream>
#include <sstream>
@@ -47,11 +48,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr
#undef STR
int size[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
-for (size_t i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = static_cast<int>(t_sizes[i]);
stride[i] = static_cast<int>(t_strides[i]);
}
-for (size_t i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
stride[i] = 1;
}
@@ -126,10 +127,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
"cuDNN filters (a.k.a. weights) must be contiguous in desired memory_format");

int size[CUDNN_DIM_MAX];
-for (int i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = (int) t.size(i);
}
-for (int i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = (int) 1;
}
dim = std::max(dim, pad);

@@ -1,5 +1,6 @@
#include <ATen/miopen/Descriptors.h>
#include <ATen/ATen.h>
+#include <c10/util/irange.h>

#include <iostream>

@@ -39,11 +40,11 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
#undef STR
int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
-for (size_t i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = static_cast<int>(t_sizes[i]);
stride[i] = static_cast<int>(t_strides[i]);
}
-for (size_t i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
stride[i] = 1;
}
@@ -103,10 +104,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo

int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
-for (int i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = (int) t.size(i);
}
-for (int i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = (int) 1;
}

@@ -500,7 +500,7 @@ inline void _rrelu_with_noise_train(
scalar_t* noise_data = noise.data_ptr<scalar_t>();
auto gen = at::get_generator_or_default<CPUGeneratorImpl>(generator, detail::getDefaultCPUGenerator());
std::lock_guard<std::mutex> lock(gen->mutex_);
-for (int64_t i = 0; i < input.numel(); i++) {
+for (const auto i : c10::irange(input.numel())) {
if (input_data[i] <= 0) {
at::uniform_real_distribution<double> uniform(lower, upper);
const scalar_t r = (scalar_t)uniform(gen);
@@ -610,7 +610,7 @@ void inline prelu_cpu_kernel_share_weights(
auto weight_val = weight.data_ptr<scalar_t>()[0];

at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
scalar_t input_data_val = input_data[i];
// to allow for compiler optimization, here splitting into two lines:
scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val;
@@ -725,7 +725,7 @@ void inline prelu_cpu_backward_kernel_share_weights(
scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0),
[&](int64_t start, int64_t end, scalar_t ident) -> scalar_t {
scalar_t partial_sum = ident;
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
scalar_t input_data_val = input_data[i];
scalar_t grad_out_data_val = grad_out_data[i];
// to allow for compiler optimization, here splitting into two lines:
@@ -839,7 +839,7 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
std::vector<int64_t> reduce_dims;
reduce_dims.push_back(0);
if (dims > 2) {
-for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i);
+for (const auto i : c10::irange(2, dims))reduce_dims.push_back(i);
}
weight_grad = weight_grad_collector.sum(reduce_dims);
}

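The Activation.cpp hunks above also show the two-argument form: c10::irange(start, end) iterates the half-open range [start, end), which is how the bodies of at::parallel_for and at::parallel_reduce are rewritten. A minimal sketch under the assumption of an ATen build (the function name scale_all is hypothetical):

    #include <ATen/Parallel.h>
    #include <c10/util/irange.h>
    #include <cstdint>
    #include <vector>

    // Scales every element in parallel chunks; each chunk walks [start, end).
    void scale_all(std::vector<float>& data, float factor) {
      at::parallel_for(0, static_cast<int64_t>(data.size()), /*grain_size=*/1000,
                       [&](int64_t start, int64_t end) {
        for (const auto i : c10::irange(start, end)) {
          data[i] *= factor;
        }
      });
    }
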
@@ -2,6 +2,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/native/AdaptivePooling.h>
#include <ATen/native/xnnpack/Engine.h>
+#include <c10/util/irange.h>


namespace at {
@@ -16,7 +17,7 @@ namespace {
{
TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2");
int64_t ndim = input.ndimension();
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
@@ -52,7 +53,7 @@ namespace {
const Tensor& input)
{
int64_t ndim = grad_output.ndimension();
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(grad_output.size(i) > 0,
"adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being "

@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {
@@ -33,19 +34,19 @@ static void adaptive_avg_pool3d_out_frame(
int64_t istrideH,
int64_t istrideW) {
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
-for (int64_t d = start; d < end; d++) {
+for (const auto d : c10::irange(start, end)) {
/* loop over output */
-for (int64_t ot = 0; ot < osizeT; ot++) {
+for (const auto ot : c10::irange(osizeT)) {
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;

-for (int64_t oh = 0; oh < osizeH; oh++) {
+for (const auto oh : c10::irange(osizeH)) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

-for (int64_t ow = 0; ow < osizeW; ow++) {
+for (const auto ow : c10::irange(osizeW)) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
@@ -58,9 +59,9 @@ static void adaptive_avg_pool3d_out_frame(

/* compute local average: */
scalar_t sum = 0;
-for (int it = 0; it < kT; it++) {
+for (const auto it : c10::irange(kT)) {
-for (int ih = 0; ih < kH; ih++) {
+for (const auto ih : c10::irange(kH)) {
-for (int iw = 0; iw < kW; iw++) {
+for (const auto iw : c10::irange(kW)) {
scalar_t val =
*(ip + it * istrideT + ih * istrideH + iw * istrideW);
sum += val;
@@ -83,7 +84,7 @@ void adaptive_avg_pool3d_out_cpu_template(
IntArrayRef output_size) {
TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3");

-for (int64_t i = 1; i < input.ndimension(); i++) {
+for (const auto i : c10::irange(1, input.ndimension())) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_avg_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
@@ -148,7 +149,7 @@ void adaptive_avg_pool3d_out_cpu_template(
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; ++b) {
+for (const auto b : c10::irange(start, end)) {
adaptive_avg_pool3d_out_frame<scalar_t>(
input_data + b * input.stride(0),
output_data + b * sizeD * osizeT * osizeH * osizeW,
@@ -181,22 +182,22 @@ static void adaptive_avg_pool3d_backward_out_frame(
int64_t osizeH,
int64_t osizeW) {
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
-for (int64_t d = start; d < end; d++) {
+for (const auto d : c10::irange(start, end)) {
scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH;
scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH;

/* calculate average */
-for (int64_t ot = 0; ot < osizeT; ot++) {
+for (const auto ot : c10::irange(osizeT)) {
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;

-for (int64_t oh = 0; oh < osizeH; oh++) {
+for (const auto oh : c10::irange(osizeH)) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

-for (int64_t ow = 0; ow < osizeW; ow++) {
+for (const auto ow : c10::irange(osizeW)) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
@@ -205,9 +206,9 @@ static void adaptive_avg_pool3d_backward_out_frame(
gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT /
kH / kW;

-for (int it = istartT; it < iendT; it++) {
+for (const auto it : c10::irange(istartT, iendT)) {
-for (int ih = istartH; ih < iendH; ih++) {
+for (const auto ih : c10::irange(istartH, iendH)) {
-for (int iw = istartW; iw < iendW; iw++) {
+for (const auto iw : c10::irange(istartW, iendW)) {
/* update gradient */
gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] +=
grad_delta;
@@ -265,7 +266,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template(
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t* gradOutput_data = gradOutput.data_ptr<scalar_t>();
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
adaptive_avg_pool3d_backward_out_frame<scalar_t>(
gradInput_data + b * sizeD * isizeT * isizeH * isizeW,
gradOutput_data + b * sizeD * osizeT * osizeH * osizeW,

|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/AdaptivePooling.h>
|
#include <ATen/native/AdaptivePooling.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
@ -10,7 +11,7 @@ TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_si
|
|||||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||||
"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ",
|
"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ",
|
||||||
input.sizes());
|
input.sizes());
|
||||||
for (int64_t i = 1; i < ndim; i++) {
|
for (const auto i : c10::irange(1, ndim)) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||||
"but input has sizes ", input.sizes(), " with dimension ", i,
|
"but input has sizes ", input.sizes(), " with dimension ", i,
|
||||||
@ -51,7 +52,7 @@ TORCH_META_FUNC(adaptive_max_pool2d_backward)
|
|||||||
int64_t ndim = grad_output.ndimension();
|
int64_t ndim = grad_output.ndimension();
|
||||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||||
"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes());
|
"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes());
|
||||||
for (int64_t i = 1; i < ndim; i++) {
|
for (const auto i : c10::irange(1, ndim)) {
|
||||||
TORCH_CHECK(grad_output.size(i) > 0,
|
TORCH_CHECK(grad_output.size(i) > 0,
|
||||||
"adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
"adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
||||||
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i,
|
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i,
|
||||||
|
@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>
#include <tuple>


@@ -11,7 +12,7 @@ TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_si
TORCH_CHECK(
ndim == 4 || ndim == 5,
"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: ", input.sizes());
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
@@ -96,8 +97,7 @@ static void adaptive_max_pool3d_single_out_frame(
int64_t istrideW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-for (auto d = start; d < end; d++)
-{
+for (const auto d : c10::irange(start, end)) {
/* loop over output */
int64_t ot, oh, ow;
for(ot = 0; ot < osizeT; ot++)
@@ -176,8 +176,7 @@ static void adaptive_max_pool3d_out_frame(
int64_t istrideW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-for (auto b = start; b < end; b++)
-{
+for (const auto b : c10::irange(start, end)) {
adaptive_max_pool3d_single_out_frame<scalar_t>(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW,
indices_data+b*sizeD*osizeT*osizeH*osizeW,
sizeD,
@@ -203,8 +202,7 @@ static void adaptive_max_pool3d_backward_single_out_frame(
int64_t osizeW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-for (auto d = start; d < end; d++)
-{
+for (const auto d : c10::irange(start, end)) {
scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW;
scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW;
int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW;
@@ -244,8 +242,7 @@ static void adaptive_max_pool3d_backward_out_frame(
int64_t osizeW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-for (auto b = start; b < end; b++)
-{
+for (const auto b : c10::irange(start, end)) {
adaptive_max_pool3d_backward_single_out_frame<scalar_t>(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW,
indices_data+b*sizeD*osizeT*osizeH*osizeW,
sizeD,

@ -2,6 +2,7 @@
|
|||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/Pool.h>
|
#include <ATen/native/Pool.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
|
|
||||||
@ -169,8 +170,7 @@ static void avg_pool3d_out_frame(
|
|||||||
c10::optional<int64_t> divisor_override)
|
c10::optional<int64_t> divisor_override)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
|
|
||||||
@ -315,7 +315,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) (
|
|||||||
scalar_t *output_data = output.data_ptr<scalar_t>();
|
scalar_t *output_data = output.data_ptr<scalar_t>();
|
||||||
|
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++) {
|
for (const auto p : c10::irange(start, end)) {
|
||||||
avg_pool3d_out_frame(
|
avg_pool3d_out_frame(
|
||||||
input_data + p * istride, output_data + p * ostride, nslices,
|
input_data + p * istride, output_data + p * ostride, nslices,
|
||||||
itime, iwidth, iheight,
|
itime, iwidth, iheight,
|
||||||
@ -358,8 +358,7 @@ static void avg_pool3d_backward_out_frame(
|
|||||||
c10::optional<int64_t> divisor_override)
|
c10::optional<int64_t> divisor_override)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
|
|
||||||
@ -500,8 +499,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) (
|
|||||||
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
||||||
|
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
avg_pool3d_backward_out_frame(
|
avg_pool3d_backward_out_frame(
|
||||||
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
|
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
|
||||||
itime, iwidth, iheight,
|
itime, iwidth, iheight,
|
||||||
|
@ -63,7 +63,7 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
|||||||
std::function<void(int64_t, int64_t)> loop = [](int64_t, int64_t){};
|
std::function<void(int64_t, int64_t)> loop = [](int64_t, int64_t){};
|
||||||
if (upper) {
|
if (upper) {
|
||||||
loop = [&](int64_t start, int64_t end) {
|
loop = [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; i++) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
for (int64_t j = i + 1; j < n; j++) {
|
for (int64_t j = i + 1; j < n; j++) {
|
||||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||||
}
|
}
|
||||||
@ -71,8 +71,8 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
|||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
loop = [&](int64_t start, int64_t end) {
|
loop = [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; i++) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
for (int64_t j = 0; j < i; j++) {
|
for (const auto j : c10::irange(i)) {
|
||||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -106,7 +106,7 @@ void apply_cholesky_inverse(Tensor& input, Tensor& infos, bool upper) {
|
|||||||
auto n = input.size(-2);
|
auto n = input.size(-2);
|
||||||
auto lda = std::max<int64_t>(1, n);
|
auto lda = std::max<int64_t>(1, n);
|
||||||
|
|
||||||
for (int64_t i = 0; i < batch_size; i++) {
|
for (const auto i : c10::irange(batch_size)) {
|
||||||
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
|
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
|
||||||
int* info_working_ptr = &infos_data[i];
|
int* info_working_ptr = &infos_data[i];
|
||||||
lapackCholeskyInverse<scalar_t>(uplo, n, input_working_ptr, lda, info_working_ptr);
|
lapackCholeskyInverse<scalar_t>(uplo, n, input_working_ptr, lda, info_working_ptr);
|
||||||
@ -501,7 +501,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) {
|
|||||||
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
|
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
|
||||||
Tensor work = at::empty({lwork}, self.options());
|
Tensor work = at::empty({lwork}, self.options());
|
||||||
|
|
||||||
for (int64_t i = 0; i < batch_size; i++) {
|
for (const auto i : c10::irange(batch_size)) {
|
||||||
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
|
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
|
||||||
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
|
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/Config.h>
|
#include <ATen/Config.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#if AT_BUILD_WITH_BLAS()
|
#if AT_BUILD_WITH_BLAS()
|
||||||
extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy);
|
extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy);
|
||||||
@ -151,7 +152,7 @@ inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx)
|
|||||||
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
|
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int64_t i = 0; i < n; i++) {
|
for (const auto i : c10::irange(n)) {
|
||||||
if (a == scalar_t(0)) {
|
if (a == scalar_t(0)) {
|
||||||
x[i * incx] = 0;
|
x[i * incx] = 0;
|
||||||
} else {
|
} else {
|
||||||
@ -176,11 +177,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((trans == 'T') || (trans == 't')) {
|
if ((trans == 'T') || (trans == 't')) {
|
||||||
for (int64_t i = 0; i < n; i++)
|
for (const auto i : c10::irange(n)) {
|
||||||
{
|
|
||||||
scalar_t sum = 0;
|
scalar_t sum = 0;
|
||||||
scalar_t *row_ = a + lda * i;
|
scalar_t *row_ = a + lda * i;
|
||||||
for (int64_t j = 0; j < m; j++) {
|
for (const auto j : c10::irange(m)) {
|
||||||
sum += x[j * incx] * row_[j];
|
sum += x[j * incx] * row_[j];
|
||||||
}
|
}
|
||||||
if (beta == scalar_t(0)) {
|
if (beta == scalar_t(0)) {
|
||||||
@ -192,10 +192,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
|||||||
} else {
|
} else {
|
||||||
if (beta != scalar_t(1) && beta != scalar_t(0)) scal<scalar_t>(m, beta, y, incy);
|
if (beta != scalar_t(1) && beta != scalar_t(0)) scal<scalar_t>(m, beta, y, incy);
|
||||||
|
|
||||||
for (int64_t j = 0; j < n; j++) {
|
for (const auto j : c10::irange(n)) {
|
||||||
scalar_t *column_ = a + lda * j;
|
scalar_t *column_ = a + lda * j;
|
||||||
scalar_t z = alpha * x[j * incx];
|
scalar_t z = alpha * x[j * incx];
|
||||||
for (int64_t i = 0; i < m; i++) {
|
for (const auto i : c10::irange(m)) {
|
||||||
//output values are ignored if beta is 0, and set to 0, nans and infs are not propagated
|
//output values are ignored if beta is 0, and set to 0, nans and infs are not propagated
|
||||||
if (j==0 && beta==scalar_t(0)) {
|
if (j==0 && beta==scalar_t(0)) {
|
||||||
y[i * incy] = scalar_t(0);
|
y[i * incy] = scalar_t(0);
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/native/BucketizationUtils.h>
|
#include <ATen/native/BucketizationUtils.h>
|
||||||
#include <ATen/native/Resize.h>
|
#include <ATen/native/Resize.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
/* Implement a TF like searchsorted and a bucketize function running on cpu
|
/* Implement a TF like searchsorted and a bucketize function running on cpu
|
||||||
*
|
*
|
||||||
@ -58,7 +59,7 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
|
|||||||
|
|
||||||
bool is_1d_boundaries = boundaries.dim() == 1;
|
bool is_1d_boundaries = boundaries.dim() == 1;
|
||||||
at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; ++i) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
// If boundaries tensor is 1d, we always search the entire boundary tensor
|
// If boundaries tensor is 1d, we always search the entire boundary tensor
|
||||||
int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd;
|
int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd;
|
||||||
const input_t *data_bd_start = &data_bd[start_bd];
|
const input_t *data_bd_start = &data_bd[start_bd];
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <ATen/native/im2col.h>
|
#include <ATen/native/im2col.h>
|
||||||
#include <ATen/native/im2col_shape_check.h>
|
#include <ATen/native/im2col_shape_check.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
// Note [im2col/col2im output padding]
|
// Note [im2col/col2im output padding]
|
||||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
@ -150,7 +151,7 @@ static void col2im_out_cpu_template(
|
|||||||
stride_width +
|
stride_width +
|
||||||
1;
|
1;
|
||||||
|
|
||||||
for (int64_t elt = 0; elt < batch_size; elt++) {
|
for (const auto elt : c10::irange(batch_size)) {
|
||||||
input_n = input.select(0, elt);
|
input_n = input.select(0, elt);
|
||||||
output_n = output.select(0, elt);
|
output_n = output.select(0, elt);
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ inline Tensor view_tensor(
|
|||||||
|
|
||||||
inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
|
inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
|
||||||
DimVector res(oldstride.size() + 1);
|
DimVector res(oldstride.size() + 1);
|
||||||
for(size_t i = 0; i < oldstride.size(); i++) {
|
for (const auto i : c10::irange(oldstride.size())) {
|
||||||
res[i] = oldstride[i] * 2;
|
res[i] = oldstride[i] * 2;
|
||||||
}
|
}
|
||||||
res.back() = 1;
|
res.back() = 1;
|
||||||
|
@ -47,7 +47,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
|
|||||||
new_shape.emplace_back(input_sizes[i]);
|
new_shape.emplace_back(input_sizes[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < (size_t)l_pad; i++) {
|
for (const auto i : c10::irange((size_t)l_pad)) {
|
||||||
auto pad_idx = pad.size() - ((i + 1) * 2);
|
auto pad_idx = pad.size() - ((i + 1) * 2);
|
||||||
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
|
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
|
||||||
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
|
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <ATen/detail/CUDAHooksInterface.h>
|
#include <ATen/detail/CUDAHooksInterface.h>
|
||||||
#include <c10/util/env.h>
|
#include <c10/util/env.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ static inline std::vector<int64_t> conv_output_size(
|
|||||||
std::vector<int64_t> output_size(dim);
|
std::vector<int64_t> output_size(dim);
|
||||||
output_size[0] = input_size[input_batch_size_dim];
|
output_size[0] = input_size[input_batch_size_dim];
|
||||||
output_size[1] = weight_size[weight_output_channels_dim];
|
output_size[1] = weight_size[weight_output_channels_dim];
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
auto dilation_ = has_dilation ? dilation[d - 2] : 1;
|
auto dilation_ = has_dilation ? dilation[d - 2] : 1;
|
||||||
auto kernel = dilation_ * (weight_size[d] - 1) + 1;
|
auto kernel = dilation_ * (weight_size[d] - 1) + 1;
|
||||||
output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
|
output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
|
||||||
@ -53,7 +54,7 @@ static inline std::vector<int64_t> conv_input_size(
|
|||||||
std::vector<int64_t> input_size(dim);
|
std::vector<int64_t> input_size(dim);
|
||||||
input_size[0] = output_size[output_batch_size_dim];
|
input_size[0] = output_size[output_batch_size_dim];
|
||||||
input_size[1] = weight_size[weight_input_channels_dim] * groups;
|
input_size[1] = weight_size[weight_input_channels_dim] * groups;
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
|
int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
|
||||||
input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
|
input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
|
||||||
kernel + output_padding[d - 2];
|
kernel + output_padding[d - 2];
|
||||||
@ -69,7 +70,7 @@ static inline std::vector<int64_t> conv_weight_size(
|
|||||||
std::vector<int64_t> weight_size(dim);
|
std::vector<int64_t> weight_size(dim);
|
||||||
weight_size[0] = output_size[1];
|
weight_size[0] = output_size[1];
|
||||||
weight_size[1] = input_size[1] / groups;
|
weight_size[1] = input_size[1] / groups;
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
|
int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
|
||||||
+ 2 * padding[d - 2] - output_padding[d - 2];
|
+ 2 * padding[d - 2] - output_padding[d - 2];
|
||||||
weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
|
weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
|
||||||
|
@ -975,7 +975,7 @@ at::Tensor _convolution(
|
|||||||
} else {
|
} else {
|
||||||
std::vector<Tensor> outputs(params.groups);
|
std::vector<Tensor> outputs(params.groups);
|
||||||
input = input.contiguous();
|
input = input.contiguous();
|
||||||
for (int g = 0; g < params.groups; ++g) {
|
for (const auto g : c10::irange(params.groups)) {
|
||||||
auto input_g = subtensor(input, 1, params.groups, g);
|
auto input_g = subtensor(input, 1, params.groups, g);
|
||||||
auto weight_g = subtensor(weight, 0, params.groups, g);
|
auto weight_g = subtensor(weight, 0, params.groups, g);
|
||||||
auto bias_g = subtensor(bias, 0, params.groups, g);
|
auto bias_g = subtensor(bias, 0, params.groups, g);
|
||||||
@ -1212,7 +1212,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std::vector<Tensor> gWt_list(groups);
|
std::vector<Tensor> gWt_list(groups);
|
||||||
for (int g = 0; g < groups; ++g) {
|
for (const auto g : c10::irange(groups)) {
|
||||||
auto ggIt_g = subvariable(ggIt, 0, groups, g);
|
auto ggIt_g = subvariable(ggIt, 0, groups, g);
|
||||||
auto gOt_g = subvariable(gOt, 0, groups, g);
|
auto gOt_g = subvariable(gOt, 0, groups, g);
|
||||||
if (gOt_g.is_cuda()) {
|
if (gOt_g.is_cuda()) {
|
||||||
@ -1239,7 +1239,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
// the ConvForward kernels don't support asymmetric padding.
|
// the ConvForward kernels don't support asymmetric padding.
|
||||||
auto gW_size = gW.sizes();
|
auto gW_size = gW.sizes();
|
||||||
auto w_size = weight.sizes();
|
auto w_size = weight.sizes();
|
||||||
for (size_t i = 2; i < gW_size.size(); ++i) {
|
for (const auto i : c10::irange(2, gW_size.size())) {
|
||||||
if (gW_size[i] > w_size[i]) {
|
if (gW_size[i] > w_size[i]) {
|
||||||
gW = gW.narrow(i, 0, w_size[i]);
|
gW = gW.narrow(i, 0, w_size[i]);
|
||||||
gW_size = gW.sizes();
|
gW_size = gW.sizes();
|
||||||
@ -1268,7 +1268,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
// rather than narrowing the computed gI
|
// rather than narrowing the computed gI
|
||||||
auto gI_size = gI.sizes();
|
auto gI_size = gI.sizes();
|
||||||
auto i_size = input.sizes();
|
auto i_size = input.sizes();
|
||||||
for (size_t i = 2; i < gI_size.size(); ++i) {
|
for (const auto i : c10::irange(2, gI_size.size())) {
|
||||||
if (gI_size[i] > i_size[i]) {
|
if (gI_size[i] > i_size[i]) {
|
||||||
gI = gI.narrow(i, 0, i_size[i]);
|
gI = gI.narrow(i, 0, i_size[i]);
|
||||||
gI_size = gI.sizes();
|
gI_size = gI.sizes();
|
||||||
@ -1289,7 +1289,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
|
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for(size_t i = 0; i < kernel_size.size(); ++i) {
|
for (const auto i : c10::irange(kernel_size.size())) {
|
||||||
// Check if whole input has been used or not
|
// Check if whole input has been used or not
|
||||||
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
|
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
|
||||||
- 2 * gi_conv_params.padding[i]
|
- 2 * gi_conv_params.padding[i]
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/native/CPUBlas.h>
|
#include <ATen/native/CPUBlas.h>
|
||||||
#include <ATen/native/Unfold2d.h>
|
#include <ATen/native/Unfold2d.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
namespace native {
|
namespace native {
|
||||||
@ -299,7 +300,7 @@ void slow_conv2d_backward_out_cpu_template(
|
|||||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||||
auto fgrad_input = std::make_unique<scalar_t[]>(
|
auto fgrad_input = std::make_unique<scalar_t[]>(
|
||||||
c10::multiply_integers(finput.sizes().slice(1)));
|
c10::multiply_integers(finput.sizes().slice(1)));
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto grad_input_t = grad_input_a[t];
|
auto grad_input_t = grad_input_a[t];
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
slow_conv2d_backward_update_grad_input_frame(
|
slow_conv2d_backward_update_grad_input_frame(
|
||||||
@ -478,7 +479,7 @@ std::tuple<Tensor&, Tensor&> slow_conv2d_forward_out_cpu(
|
|||||||
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
|
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
|
||||||
|
|
||||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto input_t = input_a[t];
|
auto input_t = input_a[t];
|
||||||
auto output_t = output_a[t];
|
auto output_t = output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/native/CPUBlas.h>
|
#include <ATen/native/CPUBlas.h>
|
||||||
#include <ATen/native/Unfold3d.h>
|
#include <ATen/native/Unfold3d.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
constexpr int64_t CONV3D_GRAIN_SALT = 20;
|
constexpr int64_t CONV3D_GRAIN_SALT = 20;
|
||||||
|
|
||||||
@ -358,7 +359,7 @@ void slow_conv3d_backward_out_cpu_template(
|
|||||||
auto fgrad_input_a = fgrad_input.accessor<scalar_t, 3>();
|
auto fgrad_input_a = fgrad_input.accessor<scalar_t, 3>();
|
||||||
auto weight_2d_a = weight2d.accessor<scalar_t, 2>();
|
auto weight_2d_a = weight2d.accessor<scalar_t, 2>();
|
||||||
|
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto grad_input_t = grad_input_a[t];
|
auto grad_input_t = grad_input_a[t];
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
auto fgrad_input_t = fgrad_input_a[t];
|
auto fgrad_input_t = fgrad_input_a[t];
|
||||||
@ -462,7 +463,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template(
|
|||||||
auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
|
auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
|
||||||
auto grad_output_a = grad_output_contiguous.accessor<scalar_t, 5>();
|
auto grad_output_a = grad_output_contiguous.accessor<scalar_t, 5>();
|
||||||
auto finput_a = finput.accessor<scalar_t, 3>();
|
auto finput_a = finput.accessor<scalar_t, 3>();
|
||||||
for (int64_t t = 0; t < batch_size; t++) {
|
for (const auto t : c10::irange(batch_size)) {
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
slow_conv3d_backward_weight_frame(
|
slow_conv3d_backward_weight_frame(
|
||||||
@ -564,7 +565,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv3d_forward_out_cpu(const Tensor&
|
|||||||
|
|
||||||
at::parallel_for(
|
at::parallel_for(
|
||||||
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
|
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto input_t = input_a[t];
|
auto input_t = input_a[t];
|
||||||
auto output_t = output_a[t];
|
auto output_t = output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
@ -39,7 +40,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
|||||||
weight_size[2],
|
weight_size[2],
|
||||||
}, self.options());
|
}, self.options());
|
||||||
output.copy_(bias.expand(output.sizes()));
|
output.copy_(bias.expand(output.sizes()));
|
||||||
for (int k = 0; k < kw; k++) {
|
for (const auto k : c10::irange(kw)) {
|
||||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <ATen/MemoryOverlap.h>
|
#include <ATen/MemoryOverlap.h>
|
||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
#ifdef USE_FBGEMM
|
#ifdef USE_FBGEMM
|
||||||
@ -65,16 +66,16 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
|||||||
int nc = std::min(NC - C, BLOCK_SZ);
|
int nc = std::min(NC - C, BLOCK_SZ);
|
||||||
|
|
||||||
// 1. copy columns from src to buf
|
// 1. copy columns from src to buf
|
||||||
for (int c = 0; c < nc; c++) {
|
for (const auto c : c10::irange(nc)) {
|
||||||
memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t));
|
memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. transpose buf in place
|
// 2. transpose buf in place
|
||||||
int rc_max = std::max(nr, nc);
|
int rc_max = std::max(nr, nc);
|
||||||
int rc_min = std::min(nr, nc);
|
int rc_min = std::min(nr, nc);
|
||||||
for (int r = 0; r < rc_max; r++) {
|
for (const auto r : c10::irange(rc_max)) {
|
||||||
int end = std::min(r, rc_min);
|
int end = std::min(r, rc_min);
|
||||||
for (int c = 0; c < end; c++) {
|
for (const auto c : c10::irange(end)) {
|
||||||
scalar_t tmp = bp[r + BLOCK_SZ * c];
|
scalar_t tmp = bp[r + BLOCK_SZ * c];
|
||||||
bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
|
bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
|
||||||
bp[r * BLOCK_SZ + c] = tmp;
|
bp[r * BLOCK_SZ + c] = tmp;
|
||||||
@ -82,7 +83,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 3. copy rows from buf to dst
|
// 3. copy rows from buf to dst
|
||||||
for (int r = 0; r < nr; r++) {
|
for (const auto r : c10::irange(nr)) {
|
||||||
memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t));
|
memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
|
|
||||||
#include <ATen/native/Cross.h>
|
#include <ATen/native/Cross.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const c10::option
|
|||||||
|
|
||||||
int64_t dim = -1;
|
int64_t dim = -1;
|
||||||
if(!dimension.has_value()) {
|
if(!dimension.has_value()) {
|
||||||
for(int64_t i = 0; i < input.dim(); i++) {
|
for (const auto i : c10::irange(input.dim())) {
|
||||||
if(input.size(i) == 3) {
|
if(input.size(i) == 3) {
|
||||||
dim = i;
|
dim = i;
|
||||||
break;
|
break;
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||||
TORCH_CHECK( \
|
TORCH_CHECK( \
|
||||||
@ -43,7 +44,7 @@ std::vector<int64_t> get_output_size(
|
|||||||
IntArrayRef pad_size,
|
IntArrayRef pad_size,
|
||||||
IntArrayRef dilation_size) {
|
IntArrayRef dilation_size) {
|
||||||
std::vector<int64_t> sizes;
|
std::vector<int64_t> sizes;
|
||||||
for (int index = 0; index < dim; index++) {
|
for (const auto index : c10::irange(dim)) {
|
||||||
sizes.push_back(
|
sizes.push_back(
|
||||||
div_rtn<int64_t>(
|
div_rtn<int64_t>(
|
||||||
input.size(index + input.dim() - dim) + 2 * pad_size[index] -
|
input.size(index + input.dim() - dim) + 2 * pad_size[index] -
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/Pool.h>
|
#include <ATen/native/Pool.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
|
|
||||||
@ -37,8 +38,7 @@ static void max_pool3d_with_indices_single_out_frame(
|
|||||||
int dilationH)
|
int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
/* loop over output */
|
/* loop over output */
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
@ -120,8 +120,7 @@ static void max_pool3d_with_indices_out_frame(
|
|||||||
int dilationT, int dilationW, int dilationH)
|
int dilationT, int dilationW, int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
max_pool3d_with_indices_single_out_frame(
|
max_pool3d_with_indices_single_out_frame(
|
||||||
input_data + p * istride,
|
input_data + p * istride,
|
||||||
output_data + p * ostride,
|
output_data + p * ostride,
|
||||||
@ -285,8 +284,7 @@ static void max_pool3d_with_indices_backward_single_out_frame(
|
|||||||
int dilationH)
|
int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
|
scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
|
||||||
scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
|
scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
|
||||||
int64_t *indz_p_k = indz_p + k * otime * owidth * oheight;
|
int64_t *indz_p_k = indz_p + k * otime * owidth * oheight;
|
||||||
@ -330,8 +328,7 @@ static void max_pool3d_with_indices_backward_out_frame(
|
|||||||
int dilationT, int dilationW, int dilationH)
|
int dilationT, int dilationW, int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
max_pool3d_with_indices_backward_single_out_frame<scalar_t>(
|
max_pool3d_with_indices_backward_single_out_frame<scalar_t>(
|
||||||
gradInput_data + p * istride,
|
gradInput_data + p * istride,
|
||||||
gradOutput_data + p * ostride,
|
gradOutput_data + p * ostride,
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -16,8 +17,10 @@ Tensor make_feature_noise(const Tensor& input) {
|
|||||||
sizes.reserve(input.dim());
|
sizes.reserve(input.dim());
|
||||||
sizes.push_back(input_sizes[0]);
|
sizes.push_back(input_sizes[0]);
|
||||||
sizes.push_back(input_sizes[1]);
|
sizes.push_back(input_sizes[1]);
|
||||||
for (int64_t i = 2; i < input.dim(); ++i)
|
for (const auto i : c10::irange(2, input.dim())) {
|
||||||
|
(void)i; //Suppress unused variable warning
|
||||||
sizes.push_back(1);
|
sizes.push_back(1);
|
||||||
|
}
|
||||||
return at::empty(sizes, input.options());
|
return at::empty(sizes, input.options());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,7 +123,7 @@ Tensor embedding_dense_backward_cpu(
|
|||||||
|
|
||||||
auto parallel_section = [&](index_t start, index_t end) {
|
auto parallel_section = [&](index_t start, index_t end) {
|
||||||
TensorIterator iter(add_iter);
|
TensorIterator iter(add_iter);
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
if (indices_data[i] != padding_idx) {
|
if (indices_data[i] != padding_idx) {
|
||||||
index_t k = indices_data[i];
|
index_t k = indices_data[i];
|
||||||
if (k >= start && k < end) {
|
if (k >= start && k < end) {
|
||||||
@ -167,7 +167,7 @@ Tensor & embedding_renorm_cpu_(
|
|||||||
|
|
||||||
// Note that we cannot use at::parallel_for here because we perform operations on
|
// Note that we cannot use at::parallel_for here because we perform operations on
|
||||||
// Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details.
|
// Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details.
|
||||||
for (auto i = 0; i < num_indices; i++) {
|
for (const auto i : c10::irange(num_indices)) {
|
||||||
if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) {
|
if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -107,7 +107,7 @@ index_select_add(const Tensor &select_indices,
|
|||||||
auto output_stride0 = output.strides()[0];
|
auto output_stride0 = output.strides()[0];
|
||||||
auto output_stride1 = output.strides()[1];
|
auto output_stride1 = output.strides()[1];
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
@ -247,7 +247,7 @@ index_select_add(const Tensor &select_indices,
|
|||||||
auto output_stride0 = output.strides()[0];
|
auto output_stride0 = output.strides()[0];
|
||||||
auto output_stride1 = output.strides()[1];
|
auto output_stride1 = output.strides()[1];
|
||||||
auto numel = add_indices.numel();
|
auto numel = add_indices.numel();
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
@ -302,14 +302,14 @@ index_select_scale_add(const Tensor &select_indices,
|
|||||||
auto* scale_data = scale.data_ptr<data_t>();
|
auto* scale_data = scale.data_ptr<data_t>();
|
||||||
auto scale_stride = scale.strides()[0];
|
auto scale_stride = scale.strides()[0];
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||||
auto scale = scale_data[i * scale_stride];
|
auto scale = scale_data[i * scale_stride];
|
||||||
for (int64_t j = 0; j < ddim; j++) {
|
for (const auto j : c10::irange(ddim)) {
|
||||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||||
}
|
}
|
||||||
} else if (bag_size.defined()) {
|
} else if (bag_size.defined()) {
|
||||||
@ -419,14 +419,14 @@ index_select_scale_add(const Tensor &select_indices,
|
|||||||
auto numel = add_indices.numel();
|
auto numel = add_indices.numel();
|
||||||
|
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||||
auto scale = scale_data[i * scale_stride];
|
auto scale = scale_data[i * scale_stride];
|
||||||
for (int64_t j = 0; j < ddim; j++) {
|
for (const auto j : c10::irange(ddim)) {
|
||||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||||
}
|
}
|
||||||
} else if (bag_size.defined()) {
|
} else if (bag_size.defined()) {
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <ATen/native/TensorIterator.h>
|
#include <ATen/native/TensorIterator.h>
|
||||||
#include <ATen/Utils.h>
|
#include <ATen/Utils.h>
|
||||||
#include <c10/util/accumulate.h>
|
#include <c10/util/accumulate.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
namespace native {
|
namespace native {
|
||||||
@ -63,7 +64,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
|||||||
|
|
||||||
if (nDims > 2) {
|
if (nDims > 2) {
|
||||||
int64_t dim1 = height;
|
int64_t dim1 = height;
|
||||||
for (int64_t i = 1; i < nDims; i++) {
|
for (const auto i : c10::irange(1, nDims)) {
|
||||||
if (self.size(i) != dim1) {
|
if (self.size(i) != dim1) {
|
||||||
AT_ERROR("all dimensions of input must be of equal length");
|
AT_ERROR("all dimensions of input must be of equal length");
|
||||||
}
|
}
|
||||||
@ -76,7 +77,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
|||||||
int64_t size = std::min(height, width);
|
int64_t size = std::min(height, width);
|
||||||
|
|
||||||
int64_t stride = 0;
|
int64_t stride = 0;
|
||||||
for (int64_t i = 0; i < nDims; i++) {
|
for (const auto i : c10::irange(nDims)) {
|
||||||
stride += self.stride(i);
|
stride += self.stride(i);
|
||||||
}
|
}
|
||||||
strides.push_back(stride);
|
strides.push_back(stride);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -32,7 +33,7 @@ TORCH_META_FUNC(fractional_max_pool2d) (
|
|||||||
int64_t ndims = input.ndimension();
|
int64_t ndims = input.ndimension();
|
||||||
TORCH_CHECK(ndims == 3 || ndims == 4,
|
TORCH_CHECK(ndims == 3 || ndims == 4,
|
||||||
"fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes());
|
"fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes());
|
||||||
for (int64_t i = 1; i < ndims; ++i) {
|
for (const auto i : c10::irange(1, ndims)) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got",
|
"fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||||
input.sizes(), " with dimension ", i, " being empty.");
|
input.sizes(), " with dimension ", i, " being empty.");
|
||||||
@ -106,7 +107,7 @@ static void fractional_max_pool2d_out_single_batch_frame(
|
|||||||
int outputW, int outputH,
|
int outputW, int outputH,
|
||||||
int poolSizeW, int poolSizeH) {
|
int poolSizeW, int poolSizeH) {
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; ++plane) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
/* each plane contains 2 random samples, one for W and one for H */
|
/* each plane contains 2 random samples, one for W and one for H */
|
||||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
|
scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
|
||||||
|
|
||||||
@ -177,7 +178,7 @@ static void fractional_max_pool2d_out_frame(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool2d_out_single_batch_frame<scalar_t>(
|
fractional_max_pool2d_out_single_batch_frame<scalar_t>(
|
||||||
input + batch * numPlanes * inputH * inputW,
|
input + batch * numPlanes * inputH * inputW,
|
||||||
output + batch * numPlanes * outputH * outputW,
|
output + batch * numPlanes * outputH * outputW,
|
||||||
@ -254,7 +255,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame(
|
|||||||
int inputW, int inputH,
|
int inputW, int inputH,
|
||||||
int outputW, int outputH) {
|
int outputW, int outputH) {
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; plane++) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH;
|
scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH;
|
||||||
scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH;
|
scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH;
|
||||||
int64_t* indicesForPlane = indices + plane * outputW * outputH;
|
int64_t* indicesForPlane = indices + plane * outputW * outputH;
|
||||||
@ -291,7 +292,7 @@ static void fractional_max_pool2d_backward_out_frame(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool2d_backward_out_single_batch_frame<scalar_t>(
|
fractional_max_pool2d_backward_out_single_batch_frame<scalar_t>(
|
||||||
gradInput + batch * numPlanes * inputH * inputW,
|
gradInput + batch * numPlanes * inputH * inputW,
|
||||||
gradOutput + batch * numPlanes * outputH * outputW,
|
gradOutput + batch * numPlanes * outputH * outputW,
|
||||||
|
@ -44,7 +44,7 @@ static void fractional_max_pool3d_out_single_batch_frame(
|
|||||||
int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) {
|
int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) {
|
||||||
|
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; ++plane) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
/* each plane contains 3 random samples,
|
/* each plane contains 3 random samples,
|
||||||
one for T, one for W, and one for H */
|
one for T, one for W, and one for H */
|
||||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 3;
|
scalar_t* randomSamplesForPlane = randomSamples + plane * 3;
|
||||||
@ -126,7 +126,7 @@ static void fractional_max_pool3d_out_frame(
|
|||||||
}
|
}
|
||||||
|
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool3d_out_single_batch_frame<scalar_t>(
|
fractional_max_pool3d_out_single_batch_frame<scalar_t>(
|
||||||
input + batch * numPlanes * inputW * inputH * inputT,
|
input + batch * numPlanes * inputW * inputH * inputT,
|
||||||
output + batch * numPlanes * outputW * outputH * outputT,
|
output + batch * numPlanes * outputW * outputH * outputT,
|
||||||
@ -171,7 +171,7 @@ void fractional_max_pool3d_out_cpu_template(
|
|||||||
TORCH_CHECK(ndims == 4 || ndims == 5,
|
TORCH_CHECK(ndims == 4 || ndims == 5,
|
||||||
"fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ",
|
"fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ",
|
||||||
input_.sizes());
|
input_.sizes());
|
||||||
for (int64_t i = 1; i < ndims; ++i) {
|
for (const auto i : c10::irange(1, ndims)) {
|
||||||
TORCH_CHECK(input_.size(i) > 0,
|
TORCH_CHECK(input_.size(i) > 0,
|
||||||
"fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got",
|
"fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||||
input_.sizes(), " with dimension ", i, " being empty.");
|
input_.sizes(), " with dimension ", i, " being empty.");
|
||||||
@ -243,7 +243,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame(
|
|||||||
int64_t outputT, int64_t outputH, int64_t outputW) {
|
int64_t outputT, int64_t outputH, int64_t outputW) {
|
||||||
|
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; plane++) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW;
|
scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW;
|
||||||
scalar_t* gradOutputForPlane = gradOutput +
|
scalar_t* gradOutputForPlane = gradOutput +
|
||||||
plane * outputT * outputH * outputW;
|
plane * outputT * outputH * outputW;
|
||||||
@ -284,7 +284,7 @@ static void fractional_max_pool3d_backward_out_frame(
|
|||||||
}
|
}
|
||||||
|
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool3d_backward_out_single_batch_frame<scalar_t>(
|
fractional_max_pool3d_backward_out_single_batch_frame<scalar_t>(
|
||||||
gradInput + batch * numPlanes * inputW * inputH * inputT,
|
gradInput + batch * numPlanes * inputW * inputH * inputT,
|
||||||
gradOutput + batch * numPlanes * outputW * outputH * outputT,
|
gradOutput + batch * numPlanes * outputW * outputH * outputT,
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <ATen/native/UpSample.h>
|
#include <ATen/native/UpSample.h>
|
||||||
#include <ATen/native/cpu/GridSamplerKernel.h>
|
#include <ATen/native/cpu/GridSamplerKernel.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -51,12 +52,12 @@ namespace {
|
|||||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
for (int64_t d = 0; d < out_D; ++d) {
|
for (const auto d : c10::irange(out_D)) {
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w) {
|
for (const auto w : c10::irange(out_W)) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||||
scalar_t ix = *grid_ptr_NDHW;
|
scalar_t ix = *grid_ptr_NDHW;
|
||||||
@ -222,12 +223,12 @@ namespace {
|
|||||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
|
scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
|
||||||
for (int64_t d = 0; d < out_D; ++d) {
|
for (const auto d : c10::irange(out_D)) {
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||||
@ -416,11 +417,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
|||||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w) {
|
for (const auto w : c10::irange(out_W)) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||||
scalar_t x = *grid_ptr_NHW;
|
scalar_t x = *grid_ptr_NHW;
|
||||||
@ -505,7 +506,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
|||||||
scalar_t coefficients[4];
|
scalar_t coefficients[4];
|
||||||
|
|
||||||
// Interpolate 4 values in the x directon
|
// Interpolate 4 values in the x directon
|
||||||
for (int64_t i = 0; i < 4; ++i) {
|
for (const auto i : c10::irange(4)) {
|
||||||
coefficients[i] = cubic_interp1d<scalar_t>(
|
coefficients[i] = cubic_interp1d<scalar_t>(
|
||||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||||
@ -578,11 +579,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
|||||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
|
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||||
// get the corresponding input x, y co-ordinates from grid
|
// get the corresponding input x, y co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||||
@ -703,8 +704,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
|||||||
for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) {
|
for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) {
|
||||||
scalar_t gOut = *gOut_ptr_NCHW;
|
scalar_t gOut = *gOut_ptr_NCHW;
|
||||||
|
|
||||||
for (int64_t i = 0; i < 4; ++i) {
|
for (const auto i : c10::irange(4)) {
|
||||||
for (int64_t j = 0; j < 4; ++j) {
|
for (const auto j : c10::irange(4)) {
|
||||||
|
|
||||||
// set input gradient
|
// set input gradient
|
||||||
add_value_bounded<scalar_t>(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j,
|
add_value_bounded<scalar_t>(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j,
|
||||||
@ -857,7 +858,7 @@ Tensor grid_sampler(const Tensor& input, const Tensor& grid,
|
|||||||
!(input.dim() == 5 && static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Bicubic),
|
!(input.dim() == 5 && static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Bicubic),
|
||||||
"grid_sampler(): bicubic interpolation only supports 4D input"
|
"grid_sampler(): bicubic interpolation only supports 4D input"
|
||||||
);
|
);
|
||||||
for (int64_t i = 2; i < input.dim(); i++) {
|
for (const auto i : c10::irange(2, input.dim())) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"grid_sampler(): expected input to have non-empty spatial dimensions, "
|
"grid_sampler(): expected input to have non-empty spatial dimensions, "
|
||||||
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
||||||
@@ -5,6 +5,7 @@

#include <ATen/native/im2col.h>
#include <ATen/native/im2col_shape_check.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -91,7 +92,7 @@ static void im2col_out_cpu_template(
Tensor input_n;
Tensor output_n;

-for (int64_t elt = 0; elt < batch_size; elt++) {
+for (const auto elt : c10::irange(batch_size)) {
input_n = input.select(0, elt);
output_n = output.select(0, elt);

@@ -2,6 +2,7 @@
#include <ATen/ExpandUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/core/List.h>
+#include <c10/util/irange.h>

#include <limits>

@@ -31,7 +32,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
}
// The sizes of the ByteTensor mask or bool tensor must match the sizes of the
// corresponding dimensions in self
-for (int64_t j = 0; j < index.dim(); j++) {
+for (const auto j : c10::irange(index.dim())) {
int64_t srcIdx = result.size() + j;
if (index.size(j) != self.size(srcIdx)) {
invalid_mask(self, srcIdx, index, j);

@@ -39,7 +40,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
}
// Replace with nonzeros
auto nonzero = index.nonzero();
-for (int64_t j = 0; j < index.dim(); j++) {
+for (const auto j : c10::irange(index.dim())) {
result.emplace_back(nonzero.select(1, j));
}
} else {
@@ -1158,7 +1158,7 @@ static void addbmm_impl_(
}

auto adjusted_beta(beta);
-for (int64_t batch = 0; batch < num_batches; ++batch) {
+for (const auto batch : c10::irange(num_batches)) {
result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha);
adjusted_beta = 1; // accumulate output once
}

@@ -1215,23 +1215,23 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T

int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1);
parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) {
-for (int64_t b = b_begin; b < b_end; b++) {
+for (const auto b : c10::irange(b_begin, b_end)) {
auto r1 = r0[b];
auto s1 = s0[b];
auto m1 = m0[b];
-for (int64_t i = 0; i < is; i++) {
+for (const auto i : c10::irange(is)) {
auto r2 = r1[i];
auto s2 = s1[i];
-for (int64_t j = 0; j < js; j++) {
+for (const auto j : c10::irange(js)) {
scalar_t &r = r2[j];
if (is_bmm) {
r = 0;
-for (int64_t k = 0; k < ks; k++) {
+for (const auto k : c10::irange(ks)) {
r += s2[k] * m1[k][j];
}
} else {
r *= beta;
-for (int64_t k = 0; k < ks; k++) {
+for (const auto k : c10::irange(ks)) {
r += alpha * s2[k] * m1[k][j];
}
}

@@ -1994,10 +1994,11 @@ void compute_T18_scale_square(
auto mexp_scaled = at::native::compute_T18<scalar_t>(a_scaled);
auto s_cpu = (s.device().type() == at::kCPU)
? s : s.to(at::kCPU);
-for (int64_t i = 0; i < mexp_scaled.size(0); ++i) {
+for (const auto i : c10::irange(mexp_scaled.size(0))) {
auto s_val = s_cpu.select(0, i).template item<int64_t>();
auto mexp = mexp_scaled.select(0, i);
-for (int64_t p = 0; p < s_val; ++p) {
+for (const auto p : c10::irange(s_val)) {
+(void)p; //Suppress unused variable warning
mexp = at::matmul(mexp, mexp);
}
mexp_out.select(0, i).copy_(mexp);
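
The compute_T18_scale_square hunk above also inserts "(void)p; //Suppress unused variable warning" right after the converted loop header. In the old counted loop the index appears in the loop condition, so it is never "unused"; after the conversion the body only repeats mexp = at::matmul(mexp, mexp) without reading p, and some compilers warn. Casting the variable to void is the conventional way to mark it as intentionally unused. A minimal sketch of the same idiom, using a std::vector in place of c10::irange and scalar squaring in place of at::matmul (the repeat count here is hypothetical):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  double x = 2.0;

  // 'steps' plays the role of c10::irange(s_val): a sequence whose values
  // the loop body never reads.
  std::vector<int64_t> steps(3);
  std::iota(steps.begin(), steps.end(), 0);

  for (const auto p : steps) {
    (void)p;    // intentionally unused; without this some compilers warn
    x = x * x;  // analogous to mexp = at::matmul(mexp, mexp)
  }
  std::cout << x << '\n';  // 2 squared three times: prints 256
  return 0;
}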
@@ -2265,7 +2266,7 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens
// (e.g. [0, 1, 2, ..., ndim-1])
static std::vector<int64_t> make_dim_list(int64_t ndim) {
std::vector<int64_t> dim_list(ndim);
-for (int64_t ind = 0; ind < ndim; ind++) {
+for (const auto ind : c10::irange(ndim)) {
dim_list[ind] = ind;
}
return dim_list;

@@ -2818,7 +2819,7 @@ struct KronImpl final {
a_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
b_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
result_reshape = c10::SmallVector<int64_t, 10>(maxdim);
-for (int64_t i = 0; i < maxdim; i++) {
+for (const auto i : c10::irange(maxdim)) {
a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1);
a_reshape[2 * i + 1] = 1;
b_reshape[2 * i] = 1;

@@ -2833,7 +2834,7 @@ struct KronImpl final {
TORCH_INTERNAL_ASSERT(result.defined(), "Cannot call kron_out with an undefined result tensor as the out argument. Please allocate a Tensor before calling kron_out with it.");

c10::SmallVector<int64_t, 10> mul_shape(2 * maxdim);
-for (int64_t i = 0; i < maxdim; i++) {
+for (const auto i : c10::irange(maxdim)) {
mul_shape[2 * i] = a_reshape[2 * i];
mul_shape[2 * i + 1] = b_reshape[2 * i + 1];
}
@@ -1,6 +1,7 @@
#pragma once

#include <c10/core/ScalarType.h>
+#include <c10/util/irange.h>
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/TensorUtils.h>

@@ -169,7 +170,8 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
auto* b_batch_idx_ptr = data[0];
auto* a_batch_idx_ptr = data[1];

-for (int64_t elem = 0; elem < nelems; ++elem) {
+for (const auto elem : c10::irange(nelems)) {
+(void)elem; //Suppress unused variable warning
auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);

@@ -332,7 +334,7 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
const int64_t ndim = self.ndimension();
std::vector<int64_t> perm;

-for (int64_t i = 0; i < ndim; i++) {
+for (const auto i : c10::irange(ndim)) {
auto it = std::find(a.begin(), a.end(), i);
if (it == a.end()) {
perm.push_back(i);

@@ -476,7 +478,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
"duplicate or invalid dimensions");
std::vector<int64_t> permutation(ndim);
int64_t cur_permuted_dim = 0;
-for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
+for (const auto dim_ind : c10::irange(ndim)) {
if ((dim_ind != dim0) && (dim_ind != dim1)) {
permutation[cur_permuted_dim++] = dim_ind;
}

@@ -493,7 +495,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
static inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
int64_t ndim = permutation.size();
std::vector<int64_t> reverse_permutation(ndim);
-for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
+for (const auto dim_ind : c10::irange(ndim)) {
reverse_permutation[permutation[dim_ind]] = dim_ind;
}
return reverse_permutation;
@@ -11,6 +11,7 @@
#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <ATen/native/Fill.h>
+#include <c10/util/irange.h>

#include <numeric>
#include <type_traits>

@@ -60,7 +61,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
std::vector<int64_t> tg_batch_offsets(batch_size);
if (targets.dim() == 1) { // concatenated targets
int64_t pos = 0;
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])

@@ -72,7 +73,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
else { // batch x max_target_length
// dim is 2
int64_t tg_batch_stride = targets.stride(0);
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = i * tg_batch_stride;
if (max_target_length < target_lengths[i])
max_target_length = target_lengths[i];

@@ -84,7 +85,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
" (while checking arguments for ", c, ")");
}
int64_t max_input_length = log_probs.size(0);
-for (int64_t b = 0; b < batch_size; b++) {
+for (const auto b : c10::irange(batch_size)) {
TORCH_CHECK(input_lengths[b] <= max_input_length,
"Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b],
" (while checking arguments for ", c, ")");

@@ -103,7 +104,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
// first the default
log_alpha.narrow(1, 0, 1).fill_(neginf);
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
int64_t input_length = input_lengths[b];
int64_t target_length = target_lengths[b];
auto log_probs_a = log_probs_a_global[b];

@@ -116,7 +117,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)];

// now the loop over the inputs
-for (int64_t t=1; t<input_length; t++) {
+for (const auto t : c10::irange(1, input_length)) {
for (int64_t s=0; s<2*target_length+1; s++) {
auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
// this loop over s could be parallel/vectorized, too, but the required items are one index apart
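
Several of the converted loops, including the CTC-loss one directly above, sit inside an at::parallel_for callback that hands each worker a half-open [start, end) chunk; those call sites use the two-argument c10::irange(start, end) form. The sketch below shows the shape of that pattern with simplified stand-ins for both helpers (a serial parallel_for and a vector-backed irange); it is not the ATen implementation, and the per-batch work is a placeholder.

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

namespace demo {

// Stand-in for at::parallel_for: runs the whole range as a single serial chunk.
void parallel_for(int64_t begin, int64_t end, int64_t /*grain_size*/,
                  const std::function<void(int64_t, int64_t)>& body) {
  body(begin, end);
}

// Stand-in for the two-argument c10::irange(begin, end).
std::vector<int64_t> irange(int64_t begin, int64_t end) {
  std::vector<int64_t> v(end > begin ? end - begin : 0);
  std::iota(v.begin(), v.end(), begin);
  return v;
}

}  // namespace demo

int main() {
  const int64_t batch_size = 5;
  std::vector<double> losses(batch_size, 0.0);

  demo::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
    // Old style: for (int64_t b = start; b < end; b++)
    for (const auto b : demo::irange(start, end)) {
      losses[b] = 0.5 * static_cast<double>(b);  // placeholder per-batch work
    }
  });

  for (const auto v : losses) std::cout << v << ' ';  // 0 0.5 1 1.5 2
  std::cout << '\n';
  return 0;
}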
@@ -189,7 +190,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
if (targets.dim() == 1) { // concatenated targets
int64_t pos = 0;
max_target_length = 0;
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])

@@ -200,7 +201,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
else { // batch x max_target_length
// dim is 2
int64_t tg_batch_stride = targets.stride(0);
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = i * tg_batch_stride;
}
tg_target_stride = targets.stride(1);

@@ -234,7 +235,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
TensorIterator fill_1d_iter_local(fill_1d_iter);
TensorIterator fill_log_beta_1d_iter_local(fill_log_beta_1d_iter);

-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
scalar_t nll = neg_log_likelihood.accessor<scalar_t, 1>()[b];
auto grad_a = grad_a_global[b];
if (zero_infinity && nll == std::numeric_limits<scalar_t>::infinity()) {

@@ -322,8 +323,8 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
// this could be a great target for further vectorization.
// grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16)
scalar_t gr = grad_out.accessor<scalar_t, 1>()[b];
-for (int64_t t = 0; t < input_length; t++) { // or go for the full thing?
-for (int64_t c = 0; c < num_labels; c++) {
+for (const auto t : c10::irange(input_length)) { // or go for the full thing?
+for (const auto c : c10::irange(num_labels)) {
scalar_t& res = grad_a[t][c];
scalar_t lp = log_probs_a[t][c];
res = (std::exp(lp)-std::exp(res + nll - lp)) * gr;
@@ -3,6 +3,7 @@
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/native/LossMulti.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -17,21 +18,21 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu(
int64_t dim) {
using accscalar_t = at::acc_type<scalar_t, false>;
accscalar_t sum = 0;
-for (int64_t ddt = 0; ddt < dim; ddt++) {
+for (const auto ddt : c10::irange(dim)) {
int64_t target_idx = target_data[ddt];
if (target_idx < 0) {
break;
}
is_target_data[target_idx] = 1;
}
-for (int64_t dt = 0; dt < dim; dt++) {
+for (const auto dt : c10::irange(dim)) {
int64_t target_idx = target_data[dt];
if (target_idx < 0) {
break;
}

scalar_t input_target = input_data[target_idx];
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (!is_target_data[d]) {
scalar_t z = 1 - input_target + input_data[d];
if (z > 0) {
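
For context on what the converted loops in multilabel_margin_loss_forward_inner_sum_cpu compute: the first pass marks which classes are targets (the target list is terminated by a negative index), and the second pass accumulates the hinge terms max(0, 1 - x[target] + x[d]) over all non-target classes d. The sketch below reproduces that computation in isolation; it is a simplified reading of the code shown in the hunk (the accumulation inside "if (z > 0)" is assumed to be sum += z, and the final scaling by dim follows the documented MultiLabelMarginLoss definition), not a drop-in replacement for the ATen kernel.

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified single-sample version of the inner sum: `input` holds scores for
// dim classes, `target` holds class indices terminated by a negative value.
double multilabel_margin_inner_sum_demo(const std::vector<double>& input,
                                        const std::vector<int64_t>& target) {
  const int64_t dim = static_cast<int64_t>(input.size());
  std::vector<char> is_target(dim, 0);

  // Pass 1: mark the target classes.
  for (int64_t ddt = 0; ddt < dim; ++ddt) {
    const int64_t target_idx = target[ddt];
    if (target_idx < 0) break;
    is_target[target_idx] = 1;
  }

  // Pass 2: hinge terms between each target class and every non-target class.
  double sum = 0;
  for (int64_t dt = 0; dt < dim; ++dt) {
    const int64_t target_idx = target[dt];
    if (target_idx < 0) break;
    const double input_target = input[target_idx];
    for (int64_t d = 0; d < dim; ++d) {
      if (!is_target[d]) {
        const double z = 1 - input_target + input[d];
        if (z > 0) sum += z;  // assumed accumulation; the hunk cuts off here
      }
    }
  }
  return sum / static_cast<double>(dim);  // per the MultiLabelMarginLoss docs
}

int main() {
  // Two target classes (0 and 2) out of four; -1 terminates the target list.
  const std::vector<double> input = {0.9, 0.1, 0.8, 0.2};
  const std::vector<int64_t> target = {0, 2, -1, -1};
  std::cout << multilabel_margin_inner_sum_demo(input, target) << '\n';  // 0.3
  return 0;
}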
@@ -63,7 +64,8 @@ static void multilabel_margin_loss_forward_out_frame(

accscalar_t sum = 0;

-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
+(void)t; //Suppress unused variable warning
sum += multilabel_margin_loss_forward_inner_sum_cpu(
input_data, target_data, is_target_data, dim);

@@ -81,7 +83,7 @@ static void multilabel_margin_loss_forward_out_frame(
} else {
auto output_acc = output.accessor<scalar_t, 1>();

-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
scalar_t sum = multilabel_margin_loss_forward_inner_sum_cpu(
input_data, target_data, is_target_data, dim);

@@ -171,15 +173,16 @@ static void multilabel_margin_loss_backward_out_frame(
reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim);

scalar_t* grad_input_row_data = grad_input.data_ptr<scalar_t>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t dt = 0; dt < dim; dt++) {
+for (const auto t : c10::irange(nframe)) {
+(void)t; //Suppress unused variable warning
+for (const auto dt : c10::irange(dim)) {
int64_t target_idx = target_data[dt];
if (target_idx < 0) {
break;
}

scalar_t input_target = input_data[target_idx];
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (!is_target_data[d]) {
scalar_t z = 1 - input_target + input_data[d];
if (z > 0) {

@@ -206,8 +209,8 @@ static void multilabel_margin_loss_backward_out_frame(
} else {
check_dim_size(grad_output, 1, 0, nframe);
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t d = 0; d < dim; d++) {
+for (const auto t : c10::irange(nframe)) {
+for (const auto d : c10::irange(dim)) {
grad_input_data[t * dim + d] *= grad_output_acc[t];
}
}
@@ -2,6 +2,7 @@
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/native/LossMulti.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -18,7 +19,7 @@ inline scalar_t multi_margin_inner_sum_cpu(
const int64_t target_idx) {
const scalar_t input_target = input_data[target_idx];
scalar_t sum = 0;
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (d == target_idx) {
continue;
}

@@ -63,7 +64,7 @@ static inline void multi_margin_loss_cpu_kernel(
// cannot be handled by TensorAccessor)
if (reduction == Reduction::None && output.dim() > 0) {
auto output_acc = output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
const auto idx = target_index_checked(target_data, t, dim);
auto sum = multi_margin_inner_sum_cpu(
input_data, weight_data, p, margin, dim, idx);

@@ -73,7 +74,7 @@ static inline void multi_margin_loss_cpu_kernel(
} else {
accscalar_t sum = 0;
auto output_acc = output.data_ptr<scalar_t>();
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
const auto idx = target_index_checked(target_data, t, dim);
sum += multi_margin_inner_sum_cpu(
input_data, weight_data, p, margin, dim, idx);

@@ -149,11 +150,11 @@ static void multi_margin_loss_backward_cpu_kernel(
int64_t dim,
int64_t reduction) {
scalar_t* grad_input_row_data = grad_input_data;
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
int64_t target_idx = target_index_checked(target_data, t, dim);
scalar_t input_target = input_data[target_idx];
scalar_t grad_input_target = 0;
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
scalar_t z = margin - input_target + input_data[d];
if (d == target_idx) {
continue;

@@ -186,8 +187,8 @@ static void multi_margin_loss_backward_cpu_kernel(
}
} else {
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t d = 0; d < dim; d++) {
+for (const auto t : c10::irange(nframe)) {
+for (const auto d : c10::irange(dim)) {
grad_input_data[t * dim + d] *= grad_output_acc[t];
}
}
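
The last hunk above (and the matching one in the multilabel-margin kernel earlier) ends by scaling every element of a row-major [nframe, dim] gradient buffer by that frame's incoming gradient, grad_input_data[t * dim + d] *= grad_output_acc[t]. A small self-contained sketch of that flattened-index pattern, with hypothetical sizes and data:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t nframe = 2;  // hypothetical sizes for illustration
  const int64_t dim = 3;

  // Row-major [nframe, dim] gradient buffer, initially all ones.
  std::vector<double> grad_input(nframe * dim, 1.0);
  // One incoming gradient value per frame (what grad_output_acc[t] supplies).
  const std::vector<double> grad_output = {0.5, 2.0};

  for (int64_t t = 0; t < nframe; ++t) {
    for (int64_t d = 0; d < dim; ++d) {
      grad_input[t * dim + d] *= grad_output[t];  // same indexing as the hunk
    }
  }

  for (const auto v : grad_input) std::cout << v << ' ';  // 0.5 0.5 0.5 2 2 2
  std::cout << '\n';
  return 0;
}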
@@ -9,6 +9,7 @@
#include <c10/util/SmallBuffer.h>

#include <c10/core/TensorOptions.h>
+#include <c10/util/irange.h>

namespace at {
namespace meta {

@@ -155,7 +156,7 @@ static void nll_loss_out_frame(
auto output_acc = output.accessor<scalar_t, 1>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
const auto cur_target = target_acc[i];

if (cur_target == ignore_index) {

@@ -215,7 +216,7 @@ static void nll_loss_out_frame(
scalar_t weight_partial_sums[cascade_sum_num_levels] = {0};
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
scalar_t loss_partial_sums[cascade_sum_num_levels] = {0};
-for (int64_t b = 0; b < batch_size; b++) {
+for (const auto b : c10::irange(batch_size)) {
const int64_t cur_target = target_data[b];
if (cur_target == ignore_index) {
++num_ignored;

@@ -330,7 +331,7 @@ static void nll_loss_backward_out_frame(
auto grad_input_acc = grad_input.accessor<scalar_t, 2>();
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
auto cur_target = target_acc[i];
if (cur_target == ignore_index) {
continue;
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/cpu/utils.h>
#include <ATen/native/Resize.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -109,9 +110,9 @@ static void nll_loss2d_forward_out_frame(
auto target_acc = target.accessor<int64_t, 3>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t h = 0; h < H; h++) {
-for (int64_t w = 0; w < W; w++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto h : c10::irange(H)) {
+for (const auto w : c10::irange(W)) {
const int64_t cur_target = (int64_t)target_acc[b][h][w];

if (cur_target == ignore_index) {

@@ -176,8 +177,8 @@ static void nll_loss2d_forward_out_frame(
const int64_t level_mask = level_step - 1;

int64_t num_ignored = 0;
-for (int64_t b = 0; b < batch_size; b++) {
-for (int64_t elem = 0; elem < map_size; elem++) {
+for (const auto b : c10::irange(batch_size)) {
+for (const auto elem : c10::irange(map_size)) {
const int64_t cur_target = target_data[b * map_size + elem];
if (cur_target == ignore_index) {
++num_ignored;

@@ -286,9 +287,9 @@ static void nll_loss2d_backward_out_frame(
auto target_acc = target.accessor<int64_t, 3>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t h = 0; h < H; h++) {
-for (int64_t w = 0; w < W; w++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto h : c10::irange(H)) {
+for (const auto w : c10::irange(W)) {
const int64_t cur_target = target_acc[b][h][w];
if (cur_target == ignore_index) {
continue;

@@ -329,8 +330,8 @@ static void nll_loss2d_backward_out_frame(
: grad_output_value);

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t elem = 0; elem < map_size; elem++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto elem : c10::irange(map_size)) {
const int64_t t = target_data[b * map_size + elem];

if (t != ignore_index) {
@@ -60,6 +60,7 @@ bool _nnpack_available() {
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -238,7 +239,7 @@ Tensor _nnpack_spatial_convolution(
const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;

-for (size_t batch = 0u; batch < batch_size; ++batch) {
+for (const auto batch : c10::irange(0u, batch_size)) {
const nnp_status status = nnp_convolution_inference(
algorithm,
nnp_convolution_transform_strategy_compute,
@@ -100,7 +100,7 @@ Tensor refine_names(const Tensor& self, DimnameList names) {
self_names.size(), " and ", names.size(), " respectively).");
check_names_valid_for(self, names);

-for (size_t idx = 0; idx < self_names.size(); idx++) {
+for (const auto idx : c10::irange(self_names.size())) {
const auto& self_name = self_names[idx];
const auto& out_name = names[idx];
if (self_name == out_name || self_name.isWildcard()) {

@@ -221,7 +221,7 @@ Tensor align_to(const Tensor& tensor, DimnameList order, int64_t ellipsis_idx) {
};

// Fill in the non-ellipsis dimensions
-for (auto order_idx = 0U; order_idx < order.size(); ++order_idx) {
+for (const auto order_idx : c10::irange(0U, order.size())) {
auto out_idx = order_idx;
if (order_idx >= ellipsis_idx) {
out_idx = order_idx + num_ellipsis_names;
@@ -10,6 +10,7 @@
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/batch_norm.h>
#include <ATen/native/Normalization.h>
+#include <c10/util/irange.h>

#include <vector>

@@ -156,7 +157,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
// Reduce all dimensions except dim=1
DimVector reduce_dims(ndim - 1);
reduce_dims[0] = 0;
-for (int64_t i = 2; i < ndim; ++i) {
+for (const auto i : c10::irange(2, ndim)) {
reduce_dims[i - 1] = i;
}

@@ -178,7 +179,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);

parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
save_mean_a[f] = _mean_a[f];
save_var_transform_a[f] = VarTransform<accscalar_t>{}(_var_sum_a[f] / n, eps);

@@ -206,7 +207,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(

parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
TensorIterator iter(reduce_iter);
-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
// compute variance per input
iter.unsafe_replace_operand(0, in_data + channel_stride * f);
accscalar_t var_sum = 0;

@@ -283,7 +284,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
// Reduce all dimensions except dim=1
DimVector reduce_dims(ndim - 1);
reduce_dims[0] = 0;
-for (int64_t i = 2; i < ndim; ++i) {
+for (const auto i : c10::irange(2, ndim)) {
reduce_dims[i - 1] = i;
}

@@ -330,7 +331,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
TensorIterator unary_iter_local(unary_iter);
TensorIterator binary_iter_local(binary_iter);

-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
scalar_t w = weight.defined() ? weight_a[f] : 1;

scalar_t mean, invstd;
@@ -77,7 +77,7 @@ std::tuple<Tensor, Tensor> _pack_padded_sequence(const Tensor& _input, const Ten
// more elements below in our column, we lower the counter (prev_l), and append the new
// block to the output.
int64_t prev_l = 0;
-for (int64_t i = 0; i < batch_size; ++i) {
+for (const auto i : c10::irange(batch_size)) {
int64_t l = lengths[batch_size - 1 - i];
if (l > prev_l) {
auto current_batch_size = batch_size - i;

@@ -109,7 +109,7 @@ Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_
int64_t offset = 0;
int64_t max_seq_len = batch_sizes_t.size(0);
int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
-for (int64_t i = 0; i < max_seq_len; ++i) {
+for (const auto i : c10::irange(max_seq_len)) {
grad_input[i].slice(0, 0, batch_sizes[i]).copy_(grad.slice(0, offset, offset + batch_sizes[i]));
offset += batch_sizes[i];
}

@@ -170,7 +170,8 @@ std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor
}
int64_t dec = prev_batch_size - batch_size;
if (dec > 0) {
-for (int64_t j = 0; j < dec; ++j) {
+for (const auto j : c10::irange(dec)) {
+(void)j; //Suppress unused variable warning
(*lengths--) = i;
}
}

@@ -206,7 +207,7 @@ Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value
out_dims.insert(out_dims.end(), trailing_dims.begin(), trailing_dims.end());

Tensor out = at::full(out_dims, padding_value, sequences[0].options());
-for (int64_t i = 0; i < sequences_size; i++) {
+for (const auto i : c10::irange(sequences_size)) {
const Tensor currseq = sequences[i];
const int64_t length_i = currseq.size(0);
// use index notation to prevent duplicate references to the tensor
@@ -2,6 +2,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <ATen/native/DispatchStub.h>
+#include <c10/util/irange.h>

#pragma once

@@ -212,7 +213,7 @@ pool3d_shape_check(
TORCH_CHECK(ndim == 4 || ndim == 5,
fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes());

-for (int64_t i = 1; i < ndim; ++i) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(input.size(i) > 0,
fn_name, "Expected input to have non-zero size for non-batch dimensions, but got",
input.sizes(), " with dimension ", i, " being empty.");
@@ -206,9 +206,9 @@ void CalcColOffsetsTranspose(
const int8_t* Bint8,
int32_t B_zero_point,
int32_t* col_offsets) {
-for (int i = 0; i < N; ++i) {
+for (const auto i : c10::irange(N)) {
int32_t sum = 0;
-for (int j = 0; j < K; ++j) {
+for (const auto j : c10::irange(K)) {
sum += Bint8[i * K + j];
}
col_offsets[i] = sum - B_zero_point * K;

@@ -353,7 +353,7 @@ bool CheckAndSaturate(T max_val, T* element) {
void HandleWeightsSaturation(int64_t N, float* weight) {
const float kFp16Max = RawUint16ToFp16(0x7BFF);
bool found_out_of_range = false;
-for (int64_t i = 0; i < N; ++i) {
+for (const auto i : c10::irange(N)) {
if (CheckAndSaturate<float>(kFp16Max, weight + i)) {
found_out_of_range = true;
}
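
The CalcColOffsetsTranspose hunk above converts the loop bounds but leaves the arithmetic untouched: for an int8 matrix stored row-major as N rows of K values, each output entry is the row sum minus B_zero_point * K. A small standalone rendering of that computation (plain loops, hypothetical data), just to make the quantization bookkeeping concrete:

#include <cstdint>
#include <iostream>
#include <vector>

// Demo version of the column-offset computation shown in the hunk above.
void CalcColOffsetsTransposeDemo(int64_t N, int64_t K, const int8_t* Bint8,
                                 int32_t B_zero_point, int32_t* col_offsets) {
  for (int64_t i = 0; i < N; ++i) {
    int32_t sum = 0;
    for (int64_t j = 0; j < K; ++j) {
      sum += Bint8[i * K + j];  // row-major row sum
    }
    col_offsets[i] = sum - B_zero_point * K;
  }
}

int main() {
  const int64_t N = 2, K = 3;
  const std::vector<int8_t> B = {1, 2, 3, 4, 5, 6};
  std::vector<int32_t> offsets(N);
  CalcColOffsetsTransposeDemo(N, K, B.data(), /*B_zero_point=*/1, offsets.data());
  for (const auto o : offsets) std::cout << o << ' ';  // prints 3 12
  std::cout << '\n';
  return 0;
}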
Some files were not shown because too many files have changed in this diff.