Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
use irange for loops 2 (#66746)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66746

Modified loops in files under fbsource/fbcode/caffe2/ from the format `for(TYPE var=x0;var<x_max;x++)` to the format `for(const auto var: irange(xmax))`.

This was achieved by running r-barnes's loop upgrader script (D28874212), with some modifications to exclude all files under /torch/jit, plus a number of hand-written reversions and unused-variable warning suppressions.

Test Plan: Sandcastle

Reviewed By: malfet

Differential Revision: D31705361

fbshipit-source-id: 33fd22eb03086d114e2c98e56703e8ec84460268
Committed by: Facebook GitHub Bot
Parent: 91d16cb633
Commit: 29d759948e
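The rewrite is mechanical, and the hunks below all follow the same pattern. As a quick orientation, here is a minimal sketch of the before/after loop forms; the function and variable names are hypothetical illustrations, and only `c10::irange` itself comes from this diff.

```cpp
#include <c10/util/irange.h>

#include <cstdint>
#include <vector>

// Illustrative helper, not part of the commit.
int64_t count_positive(const std::vector<float>& v) {
  int64_t count = 0;
  // Before: for (size_t i = 0; i < v.size(); ++i) { ... }
  // After: c10::irange(v.size()) yields 0, 1, ..., v.size()-1 as const values
  // of the bound's type, so the index cannot be mutated inside the body and
  // signed/unsigned mismatches in the loop condition go away.
  for (const auto i : c10::irange(v.size())) {
    if (v[i] > 0.0f) {
      ++count;
    }
  }
  return count;
}
```

The two-argument form `c10::irange(start, end)` covers half-open ranges in the same way; that is how the `at::parallel_for` bodies in several hunks below are converted.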
@@ -303,7 +303,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
  w_zero_points[0]);
  auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
  auto wt_numel = weight_contig.numel();
- for (int i = 0; i < wt_numel; ++i) {
+ for (const auto i : c10::irange(wt_numel)) {
  qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
  }
  // Original bias was float, so we requantize it here.
@@ -301,7 +301,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(
  auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
  int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
  auto wt_numel = weight_contig.numel();
- for (int i = 0; i < wt_numel; ++i) {
+ for (const auto i : c10::irange(wt_numel)) {
  qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
  }
@@ -9,6 +9,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -43,7 +44,7 @@ void spatial_dilated_max_pooling(
  int64_t dW, // dilation
  T* oData) { // output arrays (data and max-index)
  at::parallel_for(0, iC, 0, [&](int64_t start, int64_t end) {
- for (auto p = start; p < end; ++p) {
+ for (const auto p : c10::irange(start, end)) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t row, col;
  const T* i_p = iData + p * iW * iH;
@@ -195,7 +196,7 @@ Tensor q_maxpool_2d(
  oData);
  } else {
  at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
- for (auto p = start; p < end; ++p) {
+ for (const auto p : c10::irange(start, end)) {
  auto* iData = qxd + p * iC * iW * iH;
  auto* oData = qyd + p * oC * oW * oH;
  spatial_dilated_max_pooling<Q>(
@@ -6,6 +6,7 @@
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
  #include <ATen/native/quantized/cpu/quantized_ops.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
  #include <torch/library.h>
@@ -30,7 +31,7 @@ Tensor qnnpack_relu(Tensor input) {
  initQNNPACK();

  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
@@ -7,6 +7,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -26,7 +27,7 @@ Tensor qnnpack_sigmoid(

  Tensor input_contig = input.contiguous(input.suggest_memory_format());
  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
@@ -7,6 +7,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -29,7 +30,7 @@ Tensor qnnpack_tanh(Tensor input) {

  Tensor input_contig = input.contiguous(input.suggest_memory_format());
  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
  const auto zero_point = input_contig.q_zero_point();
@@ -1,6 +1,7 @@
  #pragma once

  #include <ATen/ATen.h>
+ #include <c10/util/irange.h>
  #include <algorithm>
  #include <cmath>
@@ -193,7 +194,7 @@ static C10_UNUSED torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_
  inline void HandleWeightsSaturation(int64_t N, float* weight) {
  const float kFp16Max = RawUint16ToFp16(0x7BFF);
  bool found_out_of_range = false;
- for (int64_t i = 0; i < N; ++i) {
+ for (const auto i : c10::irange(N)) {
  bool saturate = CheckAndSaturate<float>(kFp16Max, weight + i);
  if (saturate) {
  found_out_of_range = true;
@@ -2,6 +2,7 @@
  #include <ATen/native/UpSample.h>
  #include <ATen/native/quantized/affine_quantizer.h>
  #include <ATen/native/quantized/cpu/quantized_ops.h>
+ #include <c10/util/irange.h>

  #include <algorithm>
  #include <cmath>
@@ -57,7 +58,7 @@ static void upsample_bilinear2d_out_frame(
  const int64_t input_q_zero_point = input.q_zero_point();
  const int64_t output_q_zero_point = output.q_zero_point();

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const auto h1r = area_pixel_compute_source_index<float>(
  rheight, h2, align_corners, /*cubic=*/false);
@@ -67,7 +68,7 @@ static void upsample_bilinear2d_out_frame(
  const float h1lambda = h1r - h1;
  const float h0lambda = static_cast<float>(1.) - h1lambda;

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const auto w1r = area_pixel_compute_source_index<float>(
  rwidth, w2, align_corners, /*cubic=*/false);
@@ -79,7 +80,8 @@ static void upsample_bilinear2d_out_frame(
  const typename scalar_t::underlying* pos1 = i_p + h1 * input_width + w1;
  typename scalar_t::underlying* pos2 = o_p + h2 * output_width + w2;

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  float result = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
  h1lambda *
  (w0lambda * pos1[h1p * input_width] +
@@ -44,18 +44,19 @@ static void upsample_nearest2d_out_frame(
  return;
  }

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);

  const auto* pos1 = &i_p[h1 * input_width + w1];
  auto* pos2 = &o_p[h2 * output_width + w2];

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  pos2[0] = pos1[0];
  pos1 += input_height * input_width;
  pos2 += output_height * output_width;
@@ -88,11 +89,11 @@ static void upsample_nearest2d_out_frame_nhwc(
  return;
  }

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);
@@ -48,22 +48,23 @@ static void upsample_nearest3d_out_frame(
  return;
  }

- for (int64_t d2 = 0; d2 < output_depth; ++d2) {
+ for (const auto d2 : c10::irange(output_depth)) {
  const int64_t d1 =
  nn_compute_source_index_fn(depth_scale, d2, input_depth);

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);

  const auto* pos1 = &i_p[d1 * input_height * input_width + h1 * input_width + w1];
  auto* pos2 = &o_p[d2 * output_height * output_width + h2 * output_width + w2];

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  pos2[0] = pos1[0];
  pos1 += input_depth * input_height * input_width;
  pos2 += output_depth * output_height * output_width;
@@ -101,14 +102,14 @@ static void upsample_nearest3d_out_frame_nhwc(
  return;
  }

- for (int64_t d2 = 0; d2 < output_depth; ++d2) {
+ for (const auto d2 : c10::irange(output_depth)) {
  const int64_t d1 =
  nn_compute_source_index_fn(depth_scale, d2, input_depth);
- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);
@@ -218,7 +218,7 @@ std::tuple<Tensor, Tensor, Tensor> _fake_quantize_learnable_per_channel_affine_b
  // into the same shapes as X along the channel axis.
  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  int64_t* axis_mask = (int64_t *) calloc(numDimensions, sizeof(int64_t));
- for (int i = 0; i < numDimensions; ++i) {
+ for (const auto i : c10::irange(numDimensions)) {
  axis_mask[i] = (i == axis) ? X.size(axis) : 1;
  }
  auto X_shape = X.sizes();
@ -7,6 +7,7 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
@ -71,9 +72,9 @@ std::vector<int64_t> get_offsets(const Tensor& indices, const IntArrayRef& sizes
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i=0; i < nnz; i++) {
|
||||
for (const auto i : c10::irange(nnz)) {
|
||||
int64_t acc = 0;
|
||||
for (int64_t j=0; j < ndim; j++) {
|
||||
for (const auto j : c10::irange(ndim)) {
|
||||
auto indices_row = indices_accessor[j];
|
||||
auto stride = strides[j];
|
||||
if (j != dim) {
|
||||
@ -119,9 +120,9 @@ std::vector<std::vector<int64_t>> get_pools(const Tensor& indices, const IntArra
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i=0; i < nnz; i++) {
|
||||
for (const auto i : c10::irange(nnz)) {
|
||||
int64_t pool_index = 0;
|
||||
for (int64_t j=0; j < ndim; j++) {
|
||||
for (const auto j : c10::irange(ndim)) {
|
||||
if (j != dim) {
|
||||
const auto indices_row = indices_accessor[j];
|
||||
const auto stride = strides[j];
|
||||
@ -315,7 +316,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
|
||||
int64_t grain_size = 1;
|
||||
parallel_for(0, pools.size(), grain_size, [&](int64_t begin, int64_t end) {
|
||||
for (auto p = begin; p < end; p++) {
|
||||
for (const auto p : c10::irange(begin, end)) {
|
||||
auto pool_indices = pools[p];
|
||||
|
||||
// Skip empty pools
|
||||
@ -329,7 +330,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
/* Compute mx */
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
mx_row[j] = std::max(mx_row[j], values_row[j]);
|
||||
}
|
||||
}
|
||||
@ -338,7 +339,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
auto out_values_row = out_values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
auto v = std::exp(values_row[j] - mx_row[j]);
|
||||
if (!LogSoftMax) {
|
||||
out_values_row[j] = v;
|
||||
@ -347,7 +348,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
mx_row[j] += std::log(exp_sums_row[j]);
|
||||
} else {
|
||||
@ -359,7 +360,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
auto out_values_row = out_values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
out_values_row[j] = values_row[j] - mx_row[j];
|
||||
} else {
|
||||
@ -421,7 +422,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
values.set_(r);
|
||||
}
|
||||
} else {
|
||||
for(int64_t i=0; i<out_nnz; i++) {
|
||||
for (const auto i : c10::irange(out_nnz)) {
|
||||
auto low = std::lower_bound(grad_offsets.begin(), grad_offsets.end(), out_offsets[i]);
|
||||
auto j = low - grad_offsets.begin();
|
||||
if (j < grad_nnz && out_offsets[i] == grad_offsets[j]) {
|
||||
@ -456,7 +457,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
int64_t grain_size = 1;
|
||||
parallel_for(0, pools.size(), grain_size, [&](int64_t begin, int64_t end) {
|
||||
for (auto p = begin; p < end; p++) {
|
||||
for (const auto p : c10::irange(begin, end)) {
|
||||
auto pool_indices = pools[p];
|
||||
|
||||
// Skip empty pools
|
||||
@ -473,7 +474,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) {
|
||||
auto grad_values_row = grad_values_accessor[j];
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
tmp_row[k] -= grad_values_row[k];
|
||||
} else {
|
||||
@ -492,7 +493,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) {
|
||||
auto grad_values_row = grad_values_accessor[j];
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
values_row[k] = grad_values_row[k] + std::exp(out_values_row[k]) * tmp_row[k];
|
||||
} else {
|
||||
@ -500,7 +501,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
values_row[k] = std::exp(out_values_row[k]) * tmp_row[k];
|
||||
} else {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkl/SparseBlasImpl.h>
|
||||
#include <ATen/native/sparse/SparseBlasImpl.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
@ -60,7 +61,7 @@ void convert_indices_from_coo_to_csr_cpu(const Tensor& result, const Tensor& inp
|
||||
|
||||
at::parallel_for(0, numel - 1, GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||
input_t curr_value = data_in[start], next_value;
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
next_value = data_in[i + 1];
|
||||
for (; curr_value < next_value; curr_value++)
|
||||
data_out[curr_value + 1] = static_cast<output_t>(i + 1);
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <ATen/SparseTensorImpl.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace at { namespace native {
|
||||
@ -30,7 +31,7 @@ void csr_to_coo(const int64_t n_row, const int64_t Ap[], int64_t Bi[]) {
|
||||
Output:
|
||||
`Bi` is the row indices
|
||||
*/
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) {
|
||||
Bi[jj] = i;
|
||||
}
|
||||
@ -56,7 +57,7 @@ int64_t _csr_matmult_maxnnz(
|
||||
*/
|
||||
std::vector<int64_t> mask(n_col, -1);
|
||||
int64_t nnz = 0;
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
int64_t row_nnz = 0;
|
||||
|
||||
for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) {
|
||||
@ -127,19 +128,19 @@ void _csr_matmult(
|
||||
|
||||
Cp[0] = 0;
|
||||
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
int64_t head = -2;
|
||||
int64_t length = 0;
|
||||
|
||||
int64_t jj_start = Ap[i];
|
||||
int64_t jj_end = Ap[i + 1];
|
||||
for (int64_t jj = jj_start; jj < jj_end; jj++) {
|
||||
for (const auto jj : c10::irange(jj_start, jj_end)) {
|
||||
int64_t j = Aj[jj];
|
||||
scalar_t v = Ax[jj];
|
||||
|
||||
int64_t kk_start = Bp[j];
|
||||
int64_t kk_end = Bp[j + 1];
|
||||
for (int64_t kk = kk_start; kk < kk_end; kk++) {
|
||||
for (const auto kk : c10::irange(kk_start, kk_end)) {
|
||||
int64_t k = Bj[kk];
|
||||
|
||||
sums[k] += v * Bx[kk];
|
||||
@ -152,7 +153,8 @@ void _csr_matmult(
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t jj = 0; jj < length; jj++) {
|
||||
for (const auto jj : c10::irange(length)) {
|
||||
(void)jj; //Suppress unused variable warning
|
||||
Cj[nnz] = head;
|
||||
Cx[nnz] = sums[head];
|
||||
nnz++;
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include <ATen/native/Copy.h>
|
||||
#include <ATen/native/CPUBlas.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -229,7 +230,7 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_,
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
auto cpu_computed_indices_sizes_accessor =
|
||||
cpu_computed_indices_sizes.accessor<int64_t, 1>();
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
TORCH_CHECK(
|
||||
min_index_in_dim >= 0,
|
||||
@ -244,11 +245,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_,
|
||||
// If the indices doesn't have elements in it, there is not enough
|
||||
// information to know what the minimum sparse dimension sizes should be,
|
||||
// and in this case we set them to 0
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
computed_sizes[static_cast<size_t>(d)] = 0;
|
||||
}
|
||||
}
|
||||
for (int64_t d = 0; d < dense_dim; d++) {
|
||||
for (const auto d : c10::irange(dense_dim)) {
|
||||
computed_sizes[static_cast<size_t>(sparse_dim + d)] = values.size(d + 1);
|
||||
}
|
||||
|
||||
@ -305,7 +306,7 @@ void _validate_sparse_coo_tensor_args(
|
||||
}
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
auto cpu_max_indices_accessor = cpu_max_indices.accessor<int64_t, 1>();
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
// NB: This used to sync ndim times to access each entry; now we copy
|
||||
// everything to CPU first and then access it.
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
@ -597,7 +598,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) {
|
||||
int64_t blockSize = values.stride(0);
|
||||
scalar_t* values_ptr = values.data_ptr<scalar_t>();
|
||||
scalar_t* newValues_ptr = newValues.data_ptr<scalar_t>();
|
||||
for (int64_t j = 0; j < nnz; j++) {
|
||||
for (const auto j : c10::irange(nnz)) {
|
||||
int64_t pos = indicesPermutationAccessor[j];
|
||||
int64_t curr = indicesBufferAccessor[j];
|
||||
if (curr == prev) {
|
||||
@ -613,7 +614,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) {
|
||||
}
|
||||
} else {
|
||||
++i;
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
newIndicesAccessor[d][i] = indicesAccessor[d][pos];
|
||||
}
|
||||
if (values.numel() >
|
||||
@ -656,9 +657,9 @@ void inline sparse_mask_out_cpu_kernel(
|
||||
auto t_strides = t.strides();
|
||||
|
||||
at::parallel_for(0, r_nnz, 1000, [&](int64_t start, int64_t end) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
int64_t idx = 0;
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
idx += mask_indices_accessor[d][i] * t_strides[d];
|
||||
}
|
||||
r_values_accessor[i] = t_ptr[idx];
|
||||
@ -706,14 +707,14 @@ SparseTensor& sparse_mask_out_cpu(
|
||||
// ]. Keeping this implementation because it is faster than
|
||||
// flatten_indices()
|
||||
Tensor indices = at::zeros({mask._nnz()}, mask_indices.options());
|
||||
for (int64_t d = 0; d < mask.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.sparse_dim())) {
|
||||
indices.mul_(mask.size(d));
|
||||
indices.add_(mask_indices.select(0, d));
|
||||
}
|
||||
|
||||
std::vector<int64_t> view_size(1 + mask.dense_dim());
|
||||
view_size[0] = -1;
|
||||
for (int64_t d = 0; d < mask.dense_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.dense_dim())) {
|
||||
view_size[d + 1] = mask.size(mask.sparse_dim() + d);
|
||||
}
|
||||
|
||||
@ -777,7 +778,7 @@ Tensor sparse_mask_helper_cpu(
|
||||
|
||||
// Step 1: flatten the sparse indices `t._indices()` tensor and then map this
|
||||
// flatten value `index` to the original position `i`
|
||||
for (int64_t i = 0; i < t_nnz; i++) {
|
||||
for (const auto i : c10::irange(t_nnz)) {
|
||||
int64_t index = ti_flattened_indices.data_ptr<int64_t>()[i];
|
||||
t_flatten_indices[index] = i;
|
||||
}
|
||||
@ -802,7 +803,7 @@ Tensor sparse_mask_helper_cpu(
|
||||
const auto r_values_stride = r_values.strides()[0] * r_values.element_size();
|
||||
const auto t_values_stride = t_v.strides()[0] * t_v.element_size();
|
||||
|
||||
for (auto i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
int64_t index = flattened_mask_indices.data_ptr<int64_t>()[i];
|
||||
auto iter = t_flatten_indices.find(index);
|
||||
if (iter != t_flatten_indices.end()) {
|
||||
|
@ -601,9 +601,9 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen
|
||||
// accessors rely on nnz test
|
||||
if (nDim > nDimI) {
|
||||
auto indices_accessor = indices.accessor<int64_t, 2>();
|
||||
for (int64_t k = 0; k < sparse._nnz(); k++) {
|
||||
for (const auto k : c10::irange(sparse._nnz())) {
|
||||
Tensor dstBuffer = resultBuffer;
|
||||
for (int64_t d = 0; d < sparse.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(sparse.sparse_dim())) {
|
||||
dstBuffer = dstBuffer.select(0, indices_accessor[d][k]);
|
||||
}
|
||||
Tensor srcBuffer = valuesBuffer.select(0, k);
|
||||
@ -970,7 +970,7 @@ SparseTensor& hspmm_out_sparse_cpu(const SparseTensor& sparse_, const Tensor& de
|
||||
auto indices_accessor = indices.accessor<int64_t, 2>();
|
||||
|
||||
int64_t i = -1, prevIdx = -1;
|
||||
for (int64_t j = 0; j < nnz; j++) {
|
||||
for (const auto j : c10::irange(nnz)) {
|
||||
int64_t currIdx = valueIndices_accessor[j];
|
||||
if (currIdx != prevIdx) {
|
||||
indices_accessor[0][++i] = currIdx;
|
||||
@ -1086,10 +1086,10 @@ SparseTensor& _sspaddmm_out_cpu(
|
||||
scalar_t* newv_ptr = newv.data_ptr<scalar_t>();
|
||||
scalar_t cast_alpha = alpha.to<scalar_t>();
|
||||
|
||||
for (int64_t h = 0; h < dim_i; h++) {
|
||||
for (const auto h : c10::irange(dim_i)) {
|
||||
int64_t i_start = csr_accessor[h];
|
||||
int64_t i_end = csr_accessor[h+1];
|
||||
for (int64_t i = i_start; i < i_end; i++) {
|
||||
for (const auto i : c10::irange(i_start, i_end)) {
|
||||
scalar_t val = values_accessor[i];
|
||||
int64_t col = indices_accessor[1][i];
|
||||
if (col >= 0 && col < dim_j) {
|
||||
@ -1103,7 +1103,7 @@ SparseTensor& _sspaddmm_out_cpu(
|
||||
}
|
||||
// Fill up the indices with the right values
|
||||
if (i_start != i_end) {
|
||||
for (int64_t i = 0; i < dim_k; i++) {
|
||||
for (const auto i : c10::irange(dim_k)) {
|
||||
newi_accessor[0][p+i] = h;
|
||||
newi_accessor[1][p+i] = i;
|
||||
}
|
||||
@ -1178,7 +1178,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) {
|
||||
|
||||
auto dims_to_keep_v = std::vector<int64_t>();
|
||||
auto dense_dims_to_sum_v = std::vector<int64_t>();
|
||||
for (int64_t d = 0; d < input_dim; d++) {
|
||||
for (const auto d : c10::irange(input_dim)) {
|
||||
if (dims_to_sum_b[d]) {
|
||||
if (d >= sparse_dim) dense_dims_to_sum_v.emplace_back(d + 1 - sparse_dim);
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/cuda/CUDAUtils.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -34,7 +35,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars
|
||||
// Get a flattened sparse indices, similar to NOTE [ Flatten Sparse Indices ].
|
||||
// Keeping this implementation because it is faster than flatten_indices()
|
||||
Tensor indices = at::zeros({mask._nnz()}, mask_indices.options());
|
||||
for (int64_t d = 0; d < mask.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.sparse_dim())) {
|
||||
indices.mul_(mask.size(d));
|
||||
// This used to use a buffer but I deoptimized it
|
||||
indices.add_(mask_indices.select(0, d));
|
||||
@ -42,7 +43,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars
|
||||
|
||||
std::vector<int64_t> view_size(1 + mask.dense_dim());
|
||||
view_size[0] = -1;
|
||||
for (int64_t d = 0; d < mask.dense_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.dense_dim())) {
|
||||
view_size[d + 1] = mask.size(mask.sparse_dim() + d);
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@ struct ParamsHash {
|
||||
size_t operator()(const Params& params) const {
|
||||
auto ptr = reinterpret_cast<const uint8_t*>(¶ms);
|
||||
uint32_t value = 0x811C9DC5;
|
||||
for (int i = 0; i < (int)sizeof(Params); ++i) {
|
||||
for (const auto i : c10::irange((int)sizeof(Params))) {
|
||||
value ^= ptr[i];
|
||||
value *= 0x01000193;
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#ifdef USE_VULKAN_WRAPPER
|
||||
#include <vulkan_wrapper.h>
|
||||
@ -192,7 +193,7 @@ uint32_t VContext::getComputeQueueFamilyIndex() {
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(
|
||||
physicalDevice_, &queueFamilyCount, queueFamilies.data());
|
||||
|
||||
for (uint32_t i = 0; i < queueFamilies.size(); ++i) {
|
||||
for (const auto i : c10::irange(queueFamilies.size())) {
|
||||
VkQueueFamilyProperties props = queueFamilies[i];
|
||||
if (props.queueCount > 0 && (props.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
|
||||
return i;
|
||||
@ -274,7 +275,7 @@ uint32_t findMemoryType(
|
||||
const VkMemoryPropertyFlags properties) {
|
||||
VkPhysicalDeviceMemoryProperties memoryProperties{};
|
||||
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memoryProperties);
|
||||
for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; ++i) {
|
||||
for (const auto i : c10::irange(memoryProperties.memoryTypeCount)) {
|
||||
if ((memoryTypeBits & (1 << i)) &&
|
||||
((memoryProperties.memoryTypes[i].propertyFlags & properties) ==
|
||||
properties)) {
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <ATen/native/vulkan/VulkanOpaqueTensorImpl.h>
|
||||
#include <ATen/native/vulkan/VulkanOps.h>
|
||||
#include <ATen/vulkan/Context.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -265,13 +266,13 @@ Tensor cat(const TensorList tensors, int64_t dim) {
|
||||
int64_t cat_dim_size = 0;
|
||||
|
||||
std::vector<VulkanTensor> vTensors{};
|
||||
for (int i = 0; i < tensors.size(); ++i) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
const auto& t = tensors[i];
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
t.dim() == 4, "Vulkan cat expects 4 dimensional inputs");
|
||||
TORCH_INTERNAL_ASSERT(t.is_vulkan(), "Vulkan cat expects Vulkan inputs");
|
||||
|
||||
for (int d = 0; d < 4; ++d) {
|
||||
for (const auto d : c10::irange(4)) {
|
||||
if (d == dim) {
|
||||
continue;
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <ATen/native/vulkan/Vulkan.h>
|
||||
#include <ATen/native/vulkan/VulkanCommon.h>
|
||||
@ -629,17 +630,17 @@ VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4(
|
||||
memset(basePtr, 0, size);
|
||||
const float* src = weights;
|
||||
int ridx = 0;
|
||||
for (int oc = 0; oc < OC; ++oc) {
|
||||
for (const auto oc : c10::irange(OC)) {
|
||||
int oc_4 = oc / 4;
|
||||
int oc_4_i = oc % 4;
|
||||
float* dst_oc = basePtr + oc_4 * oc_4SizeNumel;
|
||||
for (int ic = 0; ic < C; ++ic) {
|
||||
for (const auto ic : c10::irange(C)) {
|
||||
int ic_4 = ic / 4;
|
||||
int ic_4_i = ic % 4;
|
||||
float* dst_ic = dst_oc + ic_4 * KW * KH * 16;
|
||||
for (int ky = 0; ky < KH; ++ky) {
|
||||
for (const auto ky : c10::irange(KH)) {
|
||||
float* dst_ky = dst_ic + ky * KW * 16;
|
||||
for (int kx = 0; kx < KW; ++kx) {
|
||||
for (const auto kx : c10::irange(KW)) {
|
||||
float* dst_kx = dst_ky + kx * 16;
|
||||
dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++];
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <ATen/native/vulkan/api/Runtime.h>
|
||||
#include <ATen/native/vulkan/api/Adapter.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
@ -244,7 +245,7 @@ uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device
|
||||
&queue_family_count,
|
||||
queue_families_properties.data());
|
||||
|
||||
for (uint32_t i = 0; i < queue_families_properties.size(); ++i) {
|
||||
for (const auto i : c10::irange(queue_families_properties.size())) {
|
||||
const VkQueueFamilyProperties& properties = queue_families_properties[i];
|
||||
if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
|
||||
return i;
|
||||
|
@ -1005,8 +1005,7 @@ VmaDefragmentationContext defragCtx;
|
||||
vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx);
|
||||
vmaDefragmentationEnd(allocator, defragCtx);
|
||||
|
||||
for(uint32_t i = 0; i < allocCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocCount)) {
|
||||
if(allocationsChanged[i])
|
||||
{
|
||||
// Destroy buffer that is immutably bound to memory region which is no longer valid.
|
||||
@ -1083,8 +1082,7 @@ vkEndCommandBuffer(commandBuffer);
|
||||
|
||||
vmaDefragmentationEnd(allocator, defragCtx);
|
||||
|
||||
for(uint32_t i = 0; i < allocCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocCount)) {
|
||||
if(allocationsChanged[i])
|
||||
{
|
||||
// Destroy buffer that is immutably bound to memory region which is no longer valid.
|
||||
@ -4818,8 +4816,7 @@ T must be pointer type, e.g. VmaAllocation, VmaPool.
|
||||
template<typename T>
|
||||
static bool VmaValidatePointerArray(uint32_t count, const T* arr)
|
||||
{
|
||||
for(uint32_t i = 0; i < count; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(count)) {
|
||||
const T iPtr = arr[i];
|
||||
if(iPtr == VMA_NULL)
|
||||
{
|
||||
@ -7459,8 +7456,7 @@ private:
|
||||
{
|
||||
FreeSpace s = {};
|
||||
s.blockInfoIndex = SIZE_MAX;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
m_FreeSpaces[i] = s;
|
||||
}
|
||||
}
|
||||
@ -7474,8 +7470,7 @@ private:
|
||||
|
||||
// Find first invalid or the smallest structure.
|
||||
size_t bestIndex = SIZE_MAX;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
// Empty structure.
|
||||
if(m_FreeSpaces[i].blockInfoIndex == SIZE_MAX)
|
||||
{
|
||||
@ -7502,8 +7497,7 @@ private:
|
||||
{
|
||||
size_t bestIndex = SIZE_MAX;
|
||||
VkDeviceSize bestFreeSpaceAfter = 0;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
// Structure is valid.
|
||||
if(m_FreeSpaces[i].blockInfoIndex != SIZE_MAX)
|
||||
{
|
||||
@ -7846,8 +7840,7 @@ struct VmaCurrentBudgetData
|
||||
|
||||
VmaCurrentBudgetData()
|
||||
{
|
||||
for(uint32_t heapIndex = 0; heapIndex < VK_MAX_MEMORY_HEAPS; ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(VK_MAX_MEMORY_HEAPS)) {
|
||||
m_BlockBytes[heapIndex] = 0;
|
||||
m_AllocationBytes[heapIndex] = 0;
|
||||
#if VMA_MEMORY_BUDGET
|
||||
@ -8447,8 +8440,7 @@ void VmaJsonWriter::ContinueString(const char* pStr)
|
||||
VMA_ASSERT(m_InsideString);
|
||||
|
||||
const size_t strLen = strlen(pStr);
|
||||
for(size_t i = 0; i < strLen; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(strLen)) {
|
||||
char ch = pStr[i];
|
||||
if(ch == '\\')
|
||||
{
|
||||
@ -8583,8 +8575,7 @@ void VmaJsonWriter::WriteIndent(bool oneLess)
|
||||
{
|
||||
--count;
|
||||
}
|
||||
for(size_t i = 0; i < count; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(count)) {
|
||||
m_SB.Add(INDENT);
|
||||
}
|
||||
}
|
||||
@ -9123,8 +9114,7 @@ bool VmaBlockMetadata_Generic::Validate() const
|
||||
VMA_VALIDATE(m_FreeSuballocationsBySize.size() == freeSuballocationsToRegister);
|
||||
|
||||
VkDeviceSize lastSize = 0;
|
||||
for(size_t i = 0; i < m_FreeSuballocationsBySize.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_FreeSuballocationsBySize.size())) {
|
||||
VmaSuballocationList::iterator suballocItem = m_FreeSuballocationsBySize[i];
|
||||
|
||||
// Only free suballocations can be registered in m_FreeSuballocationsBySize.
|
||||
@ -10075,8 +10065,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
{
|
||||
const size_t suballoc2ndCount = suballocations2nd.size();
|
||||
size_t nullItem2ndCount = 0;
|
||||
for(size_t i = 0; i < suballoc2ndCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(suballoc2ndCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations2nd[i];
|
||||
const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE);
|
||||
|
||||
@ -10100,8 +10089,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount);
|
||||
}
|
||||
|
||||
for(size_t i = 0; i < m_1stNullItemsBeginCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_1stNullItemsBeginCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations1st[i];
|
||||
VMA_VALIDATE(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE &&
|
||||
suballoc.hAllocation == VK_NULL_HANDLE);
|
||||
@ -10109,8 +10097,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
|
||||
size_t nullItem1stCount = m_1stNullItemsBeginCount;
|
||||
|
||||
for(size_t i = m_1stNullItemsBeginCount; i < suballoc1stCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_1stNullItemsBeginCount, suballoc1stCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations1st[i];
|
||||
const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE);
|
||||
|
||||
@ -11301,10 +11288,7 @@ bool VmaBlockMetadata_Linear::CreateAllocationRequest_LowerAddress(
|
||||
// If conflict exists, allocation cannot be made here.
|
||||
if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity)
|
||||
{
|
||||
for(size_t nextSuballocIndex = index1st;
|
||||
nextSuballocIndex < suballocations1st.size();
|
||||
nextSuballocIndex++)
|
||||
{
|
||||
for (const auto nextSuballocIndex : c10::irange(index1st, suballocations1st.size())) {
|
||||
const VmaSuballocation& nextSuballoc = suballocations1st[nextSuballocIndex];
|
||||
if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity))
|
||||
{
|
||||
@ -11712,8 +11696,7 @@ void VmaBlockMetadata_Linear::CleanupAfterFree()
|
||||
{
|
||||
const size_t nonNullItemCount = suballoc1stCount - nullItem1stCount;
|
||||
size_t srcIndex = m_1stNullItemsBeginCount;
|
||||
for(size_t dstIndex = 0; dstIndex < nonNullItemCount; ++dstIndex)
|
||||
{
|
||||
for (const auto dstIndex : c10::irange(nonNullItemCount)) {
|
||||
while(suballocations1st[srcIndex].hAllocation == VK_NULL_HANDLE)
|
||||
{
|
||||
++srcIndex;
|
||||
@ -11817,8 +11800,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
VMA_VALIDATE(m_SumFreeSize == ctx.calculatedSumFreeSize);
|
||||
|
||||
// Validate free node lists.
|
||||
for(uint32_t level = 0; level < m_LevelCount; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount)) {
|
||||
VMA_VALIDATE(m_FreeList[level].front == VMA_NULL ||
|
||||
m_FreeList[level].front->free.prev == VMA_NULL);
|
||||
|
||||
@ -11840,8 +11822,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
}
|
||||
|
||||
// Validate that free lists ar higher levels are empty.
|
||||
for(uint32_t level = m_LevelCount; level < MAX_LEVELS; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount, MAX_LEVELS)) {
|
||||
VMA_VALIDATE(m_FreeList[level].front == VMA_NULL && m_FreeList[level].back == VMA_NULL);
|
||||
}
|
||||
|
||||
@ -11850,8 +11831,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
|
||||
VkDeviceSize VmaBlockMetadata_Buddy::GetUnusedRangeSizeMax() const
|
||||
{
|
||||
for(uint32_t level = 0; level < m_LevelCount; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount)) {
|
||||
if(m_FreeList[level].front != VMA_NULL)
|
||||
{
|
||||
return LevelToNodeSize(level);
|
||||
@ -12668,8 +12648,7 @@ VmaBlockVector::~VmaBlockVector()
|
||||
|
||||
VkResult VmaBlockVector::CreateMinBlocks()
|
||||
{
|
||||
for(size_t i = 0; i < m_MinBlockCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_MinBlockCount)) {
|
||||
VkResult res = CreateBlock(m_PreferredBlockSize, VMA_NULL);
|
||||
if(res != VK_SUCCESS)
|
||||
{
|
||||
@ -12692,8 +12671,7 @@ void VmaBlockVector::GetPoolStats(VmaPoolStats* pStats)
|
||||
pStats->unusedRangeSizeMax = 0;
|
||||
pStats->blockCount = blockCount;
|
||||
|
||||
for(uint32_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VMA_HEAVY_ASSERT(pBlock->Validate());
|
||||
@ -12873,8 +12851,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT)
|
||||
{
|
||||
// Forward order in m_Blocks - prefer blocks with smallest amount of free space.
|
||||
for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex )
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pCurrBlock);
|
||||
VkResult res = AllocateFromBlock(
|
||||
@ -12932,8 +12909,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
{
|
||||
// Allocate 1/8, 1/4, 1/2 as first blocks.
|
||||
const VkDeviceSize maxExistingBlockSize = CalcMaxBlockSize();
|
||||
for(uint32_t i = 0; i < NEW_BLOCK_SIZE_SHIFT_MAX; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(NEW_BLOCK_SIZE_SHIFT_MAX)) {
|
||||
const VkDeviceSize smallerNewBlockSize = newBlockSize / 2;
|
||||
if(smallerNewBlockSize > maxExistingBlockSize && smallerNewBlockSize >= size * 2)
|
||||
{
|
||||
@ -13013,8 +12989,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT)
|
||||
{
|
||||
// Forward order in m_Blocks - prefer blocks with smallest amount of free space.
|
||||
for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex )
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pCurrBlock);
|
||||
VmaAllocationRequest currRequest = {};
|
||||
@ -13238,8 +13213,7 @@ VkDeviceSize VmaBlockVector::CalcMaxBlockSize() const
|
||||
|
||||
void VmaBlockVector::Remove(VmaDeviceMemoryBlock* pBlock)
|
||||
{
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
if(m_Blocks[blockIndex] == pBlock)
|
||||
{
|
||||
VmaVectorRemove(m_Blocks, blockIndex);
|
||||
@ -13254,8 +13228,7 @@ void VmaBlockVector::IncrementallySortBlocks()
|
||||
if(m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT)
|
||||
{
|
||||
// Bubble sort only until first swap.
|
||||
for(size_t i = 1; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(1, m_Blocks.size())) {
|
||||
if(m_Blocks[i - 1]->m_pMetadata->GetSumFreeSize() > m_Blocks[i]->m_pMetadata->GetSumFreeSize())
|
||||
{
|
||||
VMA_SWAP(m_Blocks[i - 1], m_Blocks[i]);
|
||||
@ -13413,8 +13386,7 @@ void VmaBlockVector::ApplyDefragmentationMovesCpu(
|
||||
|
||||
// Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED.
|
||||
const size_t moveCount = moves.size();
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
blockInfo[move.srcBlockIndex].flags |= BLOCK_FLAG_USED;
|
||||
blockInfo[move.dstBlockIndex].flags |= BLOCK_FLAG_USED;
|
||||
@ -13448,8 +13420,7 @@ void VmaBlockVector::ApplyDefragmentationMovesCpu(
|
||||
const VkDeviceSize nonCoherentAtomSize = m_hAllocator->m_PhysicalDeviceProperties.limits.nonCoherentAtomSize;
|
||||
VkMappedMemoryRange memRange = { VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
|
||||
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
const BlockInfo& srcBlockInfo = blockInfo[move.srcBlockIndex];
|
||||
@ -13520,8 +13491,7 @@ void VmaBlockVector::ApplyDefragmentationMovesGpu(
|
||||
|
||||
// Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED.
|
||||
const size_t moveCount = moves.size();
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
//if(move.type == VMA_ALLOCATION_TYPE_UNKNOWN)
|
||||
@ -13560,8 +13530,7 @@ void VmaBlockVector::ApplyDefragmentationMovesGpu(
|
||||
// Go over all moves. Post data transfer commands to command buffer.
|
||||
if(pDefragCtx->res == VK_SUCCESS)
|
||||
{
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
const VmaBlockDefragmentationContext& srcBlockCtx = pDefragCtx->blockContexts[move.srcBlockIndex];
|
||||
@ -13686,8 +13655,7 @@ void VmaBlockVector::PrintDetailedMap(class VmaJsonWriter& json)
|
||||
|
||||
json.WriteString("Blocks");
|
||||
json.BeginObject();
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
json.BeginString();
|
||||
json.ContinueString(m_Blocks[i]->GetId());
|
||||
json.EndString();
|
||||
@ -13895,8 +13863,7 @@ void VmaBlockVector::CommitDefragmentations(
|
||||
size_t VmaBlockVector::CalcAllocationCount() const
|
||||
{
|
||||
size_t result = 0;
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
result += m_Blocks[i]->m_pMetadata->GetAllocationCount();
|
||||
}
|
||||
return result;
|
||||
@ -13928,8 +13895,7 @@ void VmaBlockVector::MakePoolAllocationsLost(
|
||||
{
|
||||
VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
size_t lostAllocationCount = 0;
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
lostAllocationCount += pBlock->m_pMetadata->MakeAllocationsLost(currentFrameIndex, m_FrameInUseCount);
|
||||
@ -13948,8 +13914,7 @@ VkResult VmaBlockVector::CheckCorruption()
|
||||
}
|
||||
|
||||
VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VkResult res = pBlock->CheckCorruption(m_hAllocator);
|
||||
@ -13968,8 +13933,7 @@ void VmaBlockVector::AddStats(VmaStats* pStats)
|
||||
|
||||
VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VMA_HEAVY_ASSERT(pBlock->Validate());
|
||||
@ -13998,8 +13962,7 @@ VmaDefragmentationAlgorithm_Generic::VmaDefragmentationAlgorithm_Generic(
|
||||
{
|
||||
// Create block info for each block.
|
||||
const size_t blockCount = m_pBlockVector->m_Blocks.size();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
BlockInfo* pBlockInfo = vma_new(m_hAllocator, BlockInfo)(m_hAllocator->GetAllocationCallbacks());
|
||||
pBlockInfo->m_OriginalBlockIndex = blockIndex;
|
||||
pBlockInfo->m_pBlock = m_pBlockVector->m_Blocks[blockIndex];
|
||||
@ -14197,8 +14160,7 @@ VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound(
|
||||
size_t VmaDefragmentationAlgorithm_Generic::CalcBlocksWithNonMovableCount() const
|
||||
{
|
||||
size_t result = 0;
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
if(m_Blocks[i]->m_HasNonMovableAllocations)
|
||||
{
|
||||
++result;
|
||||
@ -14219,8 +14181,7 @@ VkResult VmaDefragmentationAlgorithm_Generic::Defragment(
|
||||
}
|
||||
|
||||
const size_t blockCount = m_Blocks.size();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
BlockInfo* pBlockInfo = m_Blocks[blockIndex];
|
||||
|
||||
if(m_AllAllocations)
|
||||
@ -14325,8 +14286,7 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
|
||||
// Sort blocks in order from most destination.
|
||||
|
||||
m_BlockInfos.resize(blockCount);
|
||||
for(size_t i = 0; i < blockCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(blockCount)) {
|
||||
m_BlockInfos[i].origBlockIndex = i;
|
||||
}
|
||||
|
||||
@ -14539,8 +14499,7 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
|
||||
void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata()
|
||||
{
|
||||
const size_t blockCount = m_pBlockVector->GetBlockCount();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
VmaBlockMetadata_Generic* const pMetadata =
|
||||
(VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata;
|
||||
pMetadata->m_FreeCount = 0;
|
||||
@ -14567,8 +14526,7 @@ void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata()
|
||||
void VmaDefragmentationAlgorithm_Fast::PostprocessMetadata()
|
||||
{
|
||||
const size_t blockCount = m_pBlockVector->GetBlockCount();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
VmaBlockMetadata_Generic* const pMetadata =
|
||||
(VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata;
|
||||
const VkDeviceSize blockSize = pMetadata->GetSize();
|
||||
@ -14778,8 +14736,7 @@ VmaDefragmentationContext_T::~VmaDefragmentationContext_T()
|
||||
|
||||
void VmaDefragmentationContext_T::AddPools(uint32_t poolCount, const VmaPool* pPools)
|
||||
{
|
||||
for(uint32_t poolIndex = 0; poolIndex < poolCount; ++poolIndex)
|
||||
{
|
||||
for (const auto poolIndex : c10::irange(poolCount)) {
|
||||
VmaPool pool = pPools[poolIndex];
|
||||
VMA_ASSERT(pool);
|
||||
// Pools with algorithm other than default are not defragmented.
|
||||
@ -14817,8 +14774,7 @@ void VmaDefragmentationContext_T::AddAllocations(
|
||||
VkBool32* pAllocationsChanged)
|
||||
{
|
||||
// Dispatch pAllocations among defragmentators. Create them when necessary.
|
||||
for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex)
|
||||
{
|
||||
for (const auto allocIndex : c10::irange(allocationCount)) {
|
||||
const VmaAllocation hAlloc = pAllocations[allocIndex];
|
||||
VMA_ASSERT(hAlloc);
|
||||
// DedicatedAlloc cannot be defragmented.
|
||||
@ -15615,14 +15571,12 @@ void VmaRecorder::WriteConfiguration(
|
||||
fprintf(m_File, "PhysicalDeviceLimits,nonCoherentAtomSize,%llu\n", devProps.limits.nonCoherentAtomSize);
|
||||
|
||||
fprintf(m_File, "PhysicalDeviceMemory,HeapCount,%u\n", memProps.memoryHeapCount);
|
||||
for(uint32_t i = 0; i < memProps.memoryHeapCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(memProps.memoryHeapCount)) {
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,size,%llu\n", i, memProps.memoryHeaps[i].size);
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,flags,%u\n", i, memProps.memoryHeaps[i].flags);
|
||||
}
|
||||
fprintf(m_File, "PhysicalDeviceMemory,TypeCount,%u\n", memProps.memoryTypeCount);
|
||||
for(uint32_t i = 0; i < memProps.memoryTypeCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(memProps.memoryTypeCount)) {
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Type,%u,heapIndex,%u\n", i, memProps.memoryTypes[i].heapIndex);
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Type,%u,propertyFlags,%u\n", i, memProps.memoryTypes[i].propertyFlags);
|
||||
}
|
||||
@ -15830,8 +15784,7 @@ VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) :
|
||||
|
||||
if(pCreateInfo->pHeapSizeLimit != VMA_NULL)
|
||||
{
|
||||
for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) {
|
||||
const VkDeviceSize limit = pCreateInfo->pHeapSizeLimit[heapIndex];
|
||||
if(limit != VK_WHOLE_SIZE)
|
||||
{
|
||||
@ -15844,8 +15797,7 @@ VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) :
|
||||
}
|
||||
}
|
||||
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(memTypeIndex);
|
||||
|
||||
m_pBlockVectors[memTypeIndex] = vma_new(this, VmaBlockVector)(
|
||||
@ -16747,14 +16699,11 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
{
|
||||
// Initialize.
|
||||
InitStatInfo(pStats->total);
|
||||
for(size_t i = 0; i < VK_MAX_MEMORY_TYPES; ++i)
|
||||
InitStatInfo(pStats->memoryType[i]);
|
||||
for(size_t i = 0; i < VK_MAX_MEMORY_HEAPS; ++i)
|
||||
InitStatInfo(pStats->memoryHeap[i]);
|
||||
for (const auto i : c10::irange(VK_MAX_MEMORY_TYPES))InitStatInfo(pStats->memoryType[i]);
|
||||
for (const auto i : c10::irange(VK_MAX_MEMORY_HEAPS))InitStatInfo(pStats->memoryHeap[i]);
|
||||
|
||||
// Process default pools.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex];
|
||||
VMA_ASSERT(pBlockVector);
|
||||
pBlockVector->AddStats(pStats);
|
||||
@ -16770,8 +16719,7 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
}
|
||||
|
||||
// Process dedicated allocations.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
const uint32_t memHeapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex);
|
||||
VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex);
|
||||
AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex];
|
||||
@ -16788,10 +16736,8 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
|
||||
// Postprocess.
|
||||
VmaPostprocessCalcStatInfo(pStats->total);
|
||||
for(size_t i = 0; i < GetMemoryTypeCount(); ++i)
|
||||
VmaPostprocessCalcStatInfo(pStats->memoryType[i]);
|
||||
for(size_t i = 0; i < GetMemoryHeapCount(); ++i)
|
||||
VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]);
|
||||
for (const auto i : c10::irange(GetMemoryTypeCount()))VmaPostprocessCalcStatInfo(pStats->memoryType[i]);
|
||||
for (const auto i : c10::irange(GetMemoryHeapCount()))VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]);
|
||||
}
|
||||
|
||||
void VmaAllocator_T::GetBudget(VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount)
|
||||
@ -17114,8 +17060,7 @@ VkResult VmaAllocator_T::CheckCorruption(uint32_t memoryTypeBits)
|
||||
VkResult finalRes = VK_ERROR_FEATURE_NOT_PRESENT;
|
||||
|
||||
// Process default pools.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if(((1u << memTypeIndex) & memoryTypeBits) != 0)
|
||||
{
|
||||
VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex];
|
||||
@ -17463,8 +17408,7 @@ VkResult VmaAllocator_T::FlushOrInvalidateAllocations(
|
||||
typedef VmaSmallVector<VkMappedMemoryRange, RangeAllocator, 16> RangeVector;
|
||||
RangeVector ranges = RangeVector(RangeAllocator(GetAllocationCallbacks()));
|
||||
|
||||
for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex)
|
||||
{
|
||||
for (const auto allocIndex : c10::irange(allocationCount)) {
|
||||
const VmaAllocation alloc = allocations[allocIndex];
|
||||
const VkDeviceSize offset = offsets != VMA_NULL ? offsets[allocIndex] : 0;
|
||||
const VkDeviceSize size = sizes != VMA_NULL ? sizes[allocIndex] : VK_WHOLE_SIZE;
|
||||
@ -17559,8 +17503,7 @@ uint32_t VmaAllocator_T::CalculateGlobalMemoryTypeBits() const
|
||||
if(!m_UseAmdDeviceCoherentMemory)
|
||||
{
|
||||
// Exclude memory types that have VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if((m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0)
|
||||
{
|
||||
memoryTypeBits &= ~(1u << memTypeIndex);
|
||||
@ -17650,8 +17593,7 @@ void VmaAllocator_T::UpdateVulkanBudget()
|
||||
{
|
||||
VmaMutexLockWrite lockWrite(m_Budget.m_BudgetMutex, m_UseMutex);
|
||||
|
||||
for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) {
|
||||
m_Budget.m_VulkanUsage[heapIndex] = budgetProps.heapUsage[heapIndex];
|
||||
m_Budget.m_VulkanBudget[heapIndex] = budgetProps.heapBudget[heapIndex];
|
||||
m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] = m_Budget.m_BlockBytes[heapIndex].load();
|
||||
@ -17713,8 +17655,7 @@ uint32_t VmaAllocator_T::GetGpuDefragmentationMemoryTypeBits()
|
||||
void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
{
|
||||
bool dedicatedAllocationsStarted = false;
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex);
|
||||
AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex];
|
||||
VMA_ASSERT(pDedicatedAllocVector);
|
||||
@ -17751,8 +17692,7 @@ void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
|
||||
{
|
||||
bool allocationsStarted = false;
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if(m_pBlockVectors[memTypeIndex]->IsEmpty() == false)
|
||||
{
|
||||
if(allocationsStarted == false)
|
||||
@ -17783,8 +17723,7 @@ void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
{
|
||||
json.WriteString("Pools");
|
||||
json.BeginObject();
|
||||
for(size_t poolIndex = 0; poolIndex < poolCount; ++poolIndex)
|
||||
{
|
||||
for (const auto poolIndex : c10::irange(poolCount)) {
|
||||
json.BeginString();
|
||||
json.ContinueString(m_Pools[poolIndex]->GetId());
|
||||
json.EndString();
|
||||
@ -18425,8 +18364,7 @@ VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages(
|
||||
|
||||
if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS)
|
||||
{
|
||||
for(size_t i = 0; i < allocationCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocationCount)) {
|
||||
allocator->GetAllocationInfo(pAllocations[i], pAllocationInfo + i);
|
||||
}
|
||||
}
|
||||
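All of these hunks apply the same mechanical rewrite, so it may help to see the helper in isolation. The following is an illustrative sketch only, not part of the patch: the container, values, and function name are invented, and the only repository header assumed is <c10/util/irange.h>.

#include <c10/util/irange.h>

#include <cstdint>
#include <vector>

int64_t irange_demo() {
  std::vector<int64_t> v(8, 1);
  // c10::irange(n) replaces `for (T i = 0; i < n; ++i)`; the index type is
  // deduced from the bound, so it matches numel()/size() without casts.
  for (const auto i : c10::irange(v.size())) {
    v[i] = static_cast<int64_t>(i);
  }
  // The two-argument form is the half-open range [begin, end), used above
  // wherever a loop started at a value other than zero.
  int64_t sum = 0;
  for (const auto i : c10::irange(1, v.size())) {
    sum += v[i] - v[i - 1];
  }
  return sum;
}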
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/native/utils/ParamUtils.h>
|
||||
#include <ATen/native/vulkan/ops/Common.h>
|
||||
#include <ATen/native/vulkan/api/Utils.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -32,7 +33,7 @@ inline bool is_pointwise(const IntArrayRef filter) {
|
||||
|
||||
bool all_lessthan(const IntArrayRef arr, const int t) {
|
||||
bool retval = true;
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
for (const auto i : c10::irange(arr.size())) {
|
||||
retval = retval && (arr[i] < t);
|
||||
}
|
||||
return retval;
|
||||
@ -173,8 +174,8 @@ vTensor pack_weights_2d(
|
||||
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
|
||||
const int64_t dst_ic4 = src_ic / 4;
|
||||
|
||||
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
|
||||
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
|
||||
for (const auto src_ih : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_iw : c10::irange(src_kw_sz)) {
|
||||
memcpy(
|
||||
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
|
||||
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
|
||||
@ -225,11 +226,11 @@ vTensor pack_weights_2d_winograd_2_3(
|
||||
float* const dst_weight_ptr = v_weight_payload.get();
|
||||
memset(dst_weight_ptr, 0, v_weight.nbytes());
|
||||
|
||||
for (int64_t src_oc = 0; src_oc < src_oc_sz; ++src_oc) {
|
||||
for (const auto src_oc : c10::irange(src_oc_sz)) {
|
||||
const int64_t dst_oh = src_oc / 4;
|
||||
const int64_t dst_iw = src_oc % 4;
|
||||
|
||||
for (int64_t src_ic = 0; src_ic < src_ic_sz; ++src_ic) {
|
||||
for (const auto src_ic : c10::irange(src_ic_sz)) {
|
||||
const int64_t dst_ow = src_ic / 4;
|
||||
const int64_t dst_c = src_ic % 4;
|
||||
|
||||
@ -344,7 +345,7 @@ vTensor pack_biases(
|
||||
float* const dst_bias_ptr = v_bias_payload.get();
|
||||
|
||||
memset(dst_bias_ptr, 0, v_bias.nbytes());
|
||||
for (int64_t i = 0; i < src_w; ++i) {
|
||||
for (const auto i : c10::irange(src_w)) {
|
||||
const int64_t c = i % 4;
|
||||
const int64_t x = i / 4;
|
||||
dst_bias_ptr[c * packed_w + x] = src_bias_ptr[i];
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/native/vulkan/ops/Mm.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -47,8 +48,8 @@ vTensor pack_weights(
|
||||
float* const dst_weight_ptr = v_weight_payload.get();
|
||||
memset(dst_weight_ptr, 0, v_weight.nbytes());
|
||||
|
||||
for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) {
|
||||
for (int64_t src_w = 0; src_w < src_kw_sz; ++src_w) {
|
||||
for (const auto src_h : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_w : c10::irange(src_kw_sz)) {
|
||||
int64_t dst_plane = 2*(src_h%2) + (src_w%2);
|
||||
int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2);
|
||||
memcpy(
|
||||
@ -109,8 +110,8 @@ vTensor pack_biases(
|
||||
float* const dst_bias_ptr = v_bias_payload.get();
|
||||
memset(dst_bias_ptr, 0, v_bias.nbytes());
|
||||
|
||||
for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) {
|
||||
for (int64_t src_w = 0; src_w < src_kw_sz; ++src_w) {
|
||||
for (const auto src_h : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_w : c10::irange(src_kw_sz)) {
|
||||
int64_t dst_plane = 2*(src_h%2) + (src_w%2);
|
||||
int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2);
|
||||
memcpy(
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/native/vulkan/ops/Common.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
namespace at {
|
||||
@ -35,7 +36,7 @@ Tensor reflection_pad2d(const Tensor& self_arg, IntArrayRef padding) {
|
||||
const vTensor& v_self = convert(self);
|
||||
|
||||
c10::SmallVector<int64_t, 4> output_size(input_dim);
|
||||
for (size_t d = 0; d < input_dim; ++d) {
|
||||
for (const auto d : c10::irange(input_dim)) {
|
||||
if (d == input_dim - 1) {
|
||||
output_size[d] = input_size[d] + pad_right + pad_left;
|
||||
} else if (d == input_dim - 2) {
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <ATen/native/utils/Factory.h>
|
||||
#include <ATen/native/utils/ParamUtils.h>
|
||||
#include <ATen/native/xnnpack/Convolution.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -150,11 +151,11 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc,
|
||||
float* in_ptr = weight_nhwc.data_ptr<float>();
|
||||
|
||||
int out_index = 0;
|
||||
for (int g = 0; g < num_groups; g++) {
|
||||
for (int o = 0; o < output_channels_per_group; o++) {
|
||||
for (int w = 0; w < kernel_width; w++) {
|
||||
for (int h = 0; h < kernel_height; h++) {
|
||||
for (int i = 0; i < input_channels_per_group; i++) {
|
||||
for (const auto g : c10::irange(num_groups)) {
|
||||
for (const auto o : c10::irange(output_channels_per_group)) {
|
||||
for (const auto w : c10::irange(kernel_width)) {
|
||||
for (const auto h : c10::irange(kernel_height)) {
|
||||
for (const auto i : c10::irange(input_channels_per_group)) {
|
||||
int in_index = (g*g_offset) + (i*i_offset) + (h*h_offset) + (w*w_offset) + (o*o_offset);
|
||||
out_ptr[out_index] = in_ptr[in_index];
|
||||
out_index++;
|
||||
@ -210,7 +211,7 @@ ContextConv2D create(
|
||||
|
||||
if (transposed) {
|
||||
const Tensor weight_reordered = reorder_weights_for_transpose_conv(weight_nhwc, groups);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
weight_sizes[i] = weight_reordered.size(i);
|
||||
}
|
||||
create_status = xnn_create_deconvolution2d_nhwc_f32(
|
||||
@ -238,7 +239,7 @@ ContextConv2D create(
|
||||
0u, // flags
|
||||
&convolution_op); // operator
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
weight_sizes[i] = weight_nhwc.size(i);
|
||||
}
|
||||
create_status = xnn_create_convolution2d_nhwc_f32(
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/nnapi/nnapi_bind.h>
|
||||
#include <ATen/nnapi/nnapi_wrapper.h>
|
||||
#include <ATen/nnapi/nnapi_model_loader.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace torch {
|
||||
namespace nnapi {
|
||||
@ -103,7 +104,7 @@ void NnapiCompilation::run(
|
||||
TORCH_CHECK((int32_t)inputs.size() == num_inputs_);
|
||||
TORCH_CHECK((int32_t)outputs.size() == num_outputs_);
|
||||
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(inputs.size())) {
|
||||
auto& t = inputs[i];
|
||||
// TODO: Check contiguous and dtype.
|
||||
ANeuralNetworksOperandType op_type;
|
||||
@ -117,7 +118,7 @@ void NnapiCompilation::run(
|
||||
t.nbytes());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
auto& t = outputs[i];
|
||||
// TODO: Check contiguous and dtype.
|
||||
check_nnapi->Execution_setOutput(
|
||||
@ -131,7 +132,7 @@ void NnapiCompilation::run(
|
||||
check_nnapi->Execution_compute(execution);
|
||||
|
||||
// TODO: Maybe skip this for fixed-size outputs?
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
auto& t = outputs[i];
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
uint32_t rank;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/CPUApplyUtils.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
@ -10,7 +11,7 @@ using namespace at;
|
||||
|
||||
void fill_tensor(int64_t scalar, Tensor& t_) {
|
||||
auto t = t_.view(-1);
|
||||
for (int64_t i = 0; i < t.numel(); i++) {
|
||||
for (const auto i : c10::irange(t.numel())) {
|
||||
t[i] = (i + 1) * scalar;
|
||||
}
|
||||
}
|
||||
@ -42,7 +43,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
auto a4 = at::empty({0}, at::TensorOptions(kCPU).dtype(kDouble));
|
||||
|
||||
std::vector<Tensor> tensors({a0, a1, a2, a3, a4});
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
tensors[i].resize_(shape);
|
||||
fill_tensor(i + 1, tensors[i]);
|
||||
if (a >= 0 && b >= 0) {
|
||||
@ -55,7 +56,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; });
|
||||
CPU_tensor_apply2<double, scalar_t>(
|
||||
a4, a1, [](double& y, scalar_t x) { y = (double)(x * x); });
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
ASSERT(a4.data_ptr<double>()[i] == target);
|
||||
@ -71,7 +72,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
a4, a1, a2, [](double& y, const scalar_t& x, const scalar_t& z) {
|
||||
y = (double)(x * x + z);
|
||||
});
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
target = target + a2.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
@ -97,7 +98,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
[](double& y, const scalar_t& x, const scalar_t& z, const scalar_t& a) {
|
||||
y = (double)(x * x + z * a);
|
||||
});
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
target = target + a2.data_ptr<scalar_t>()[i] * a3.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
@ -102,7 +103,7 @@ void trace() {
|
||||
auto foo_a = foo.accessor<float, 2>();
|
||||
float trace = 0;
|
||||
|
||||
for (int i = 0; i < foo_a.size(0); i++) {
|
||||
for (const auto i : c10::irange(foo_a.size(0))) {
|
||||
trace += foo_a[i][i];
|
||||
}
|
||||
|
||||
@ -237,8 +238,8 @@ TEST_F(atest, atest) {
|
||||
// foo = foo[3];
|
||||
auto foo_v = foo.accessor<uint8_t, 2>();
|
||||
|
||||
for (int i = 0; i < foo_v.size(0); i++) {
|
||||
for (int j = 0; j < foo_v.size(1); j++) {
|
||||
for (const auto i : c10::irange(foo_v.size(0))) {
|
||||
for (const auto j : c10::irange(foo_v.size(1))) {
|
||||
foo_v[i][j]++;
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/core/Reduction.h>
|
||||
#include <torch/cuda.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
// for TH compat test only...
|
||||
struct THFloatTensor;
|
||||
@ -84,7 +85,8 @@ void TestAdd(DeprecatedTypeProperties& type) {
|
||||
void TestZeros(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor a = zeros({1024, 1024}, type);
|
||||
for (int i = 1; i < 1000; ++i) {
|
||||
for (const auto i : c10::irange(1, 1000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
a = zeros({128, 128}, type);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
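In timing loops like TestZeros above, and in the warm-up and trial loops later in the patch, the index is only a repetition counter, so the range-for version names a variable the body never reads. The patch therefore adds a (void) cast; a minimal sketch of that pattern, with an invented body and assuming only <c10/util/irange.h>:

#include <c10/util/irange.h>

int repeat_demo() {
  int acc = 0;
  for (const auto i : c10::irange(1000)) {
    // The old `for (int i = 0; i < 1000; ++i)` used `i` in its condition, so
    // no warning fired; the range-for form needs an explicit suppression.
    (void)i; // Suppress unused variable warning
    acc += 1;
  }
  return acc;
}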
@ -102,7 +104,8 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (auto i = 0; i < 100000; i++) {
|
||||
for (const auto i : c10::irange(100000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
add_out(r, r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -119,7 +122,8 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (auto i = 0; i < 100000; i++) {
|
||||
for (const auto i : c10::irange(100000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
r = add(r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -176,7 +180,7 @@ void TestCopyBroadcasting(DeprecatedTypeProperties& type) {
|
||||
Tensor a = zeros({4, 3}, type);
|
||||
Tensor e = rand({3}, type);
|
||||
a.copy_(e);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
ASSERT_TRUE(a[i].equal(e));
|
||||
}
|
||||
}
|
||||
@ -247,13 +251,13 @@ void TestToString() {
|
||||
void TestIndexingByScalar() {
|
||||
Tensor tensor = arange(0, 10, kInt);
|
||||
Tensor one = ones({}, kInt);
|
||||
for (int64_t i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * i));
|
||||
}
|
||||
for (size_t i = 0; i < static_cast<uint64_t>(tensor.numel()); ++i) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * static_cast<int64_t>(i)));
|
||||
}
|
||||
for (int i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * i));
|
||||
}
|
||||
// NOLINTNEXTLINE(bugprone-too-small-loop-variable)
|
||||
@ -272,7 +276,7 @@ void TestIndexingByScalar() {
|
||||
void TestIndexingByZerodimTensor() {
|
||||
Tensor tensor = arange(0, 10, kInt);
|
||||
Tensor one = ones({}, kInt);
|
||||
for (int i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[one * i].equal(one * i));
|
||||
}
|
||||
// Throw StartsWith(
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/Utils.h>
|
||||
#include <ATen/CPUGeneratorImpl.h>
|
||||
#include <ATen/core/PhiloxRNGEngine.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <thread>
|
||||
#include <limits>
|
||||
#include <random>
|
||||
@ -160,7 +161,8 @@ TEST(CPUGeneratorImpl, TestPhiloxEngineOffset1) {
|
||||
// So if you want to skip 8 values, offset would
|
||||
// be 2, since 2*4=8.
|
||||
at::Philox4_32_10 engine2(123, 1, 2);
|
||||
for(int i = 0; i < 8; i++){
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
// Note: instead of using the engine() call 8 times
|
||||
// we could have achieved the same functionality by
|
||||
// calling the incr() function twice.
|
||||
@ -221,14 +223,16 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
// test with zero seed
|
||||
at::mt19937 engine1(0);
|
||||
std::mt19937 engine2(0);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
// test with large seed
|
||||
engine1 = at::mt19937(2147483647);
|
||||
engine2 = std::mt19937(2147483647);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
@ -237,7 +241,8 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
auto seed = rd();
|
||||
engine1 = at::mt19937(seed);
|
||||
engine2 = std::mt19937(seed);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/init.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/core/context_gpu.h>
|
||||
@ -34,7 +35,7 @@ TEST(CUDACaffe2ToPytorch, SimpleLegacy) {
|
||||
|
||||
auto at_cpu = at_tensor.cpu();
|
||||
auto it = at_cpu.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], 777);
|
||||
}
|
||||
}
|
||||
@ -53,7 +54,7 @@ TEST(CUDACaffe2ToPytorch, Simple) {
|
||||
|
||||
auto at_cpu = at_tensor.cpu();
|
||||
auto it = at_cpu.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], 777);
|
||||
}
|
||||
}
|
||||
@ -109,7 +110,7 @@ TEST(CUDAPytorchToCaffe2, Op) {
|
||||
ASSERT_EQ(result.GetDeviceType(), caffe2::CUDA);
|
||||
|
||||
auto data = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(cuda_get(data + i), 3.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/torch.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/core/Dict.h>
|
||||
|
||||
// Snippets for checking assembly.
|
||||
@ -643,7 +644,7 @@ TEST(IValueTest, IdentityComparisonAndHashing) {
|
||||
auto moreSampleIValues = makeMoreSampleIValues();
|
||||
|
||||
ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size());
|
||||
for (int ii = 0; ii < sampleIValues.size(); ++ii) {
|
||||
for (const auto ii : c10::irange(sampleIValues.size())) {
|
||||
if (sampleIValues[ii].isComplexDouble() ||
|
||||
sampleIValues[ii].isBlob() ||
|
||||
sampleIValues[ii].isList() ||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/CPUFunctions.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -115,7 +116,7 @@ TEST(MathKernelTest, MishBackward) {
|
||||
|
||||
TEST(MathKernelTest, NarrowCopy) {
|
||||
auto x = rand({5, 8, 7});
|
||||
for (int64_t dim = 0; dim < 3; ++dim) {
|
||||
for (const auto dim : c10::irange(3)) {
|
||||
const int64_t start = 1, length = 4;
|
||||
auto y_ref = x.narrow(dim, start, length);
|
||||
auto y_test = at::native::narrow_copy_dense(x, dim, start, length);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -16,7 +17,7 @@ using namespace at;
|
||||
|
||||
void requireEqualTensorList(TensorList t1, TensorList t2) {
|
||||
ASSERT_EQ(t1.size(), t2.size());
|
||||
for (size_t i = 0; i < t1.size(); ++i) {
|
||||
for (const auto i : c10::irange(t1.size())) {
|
||||
ASSERT_EQUAL(t1[i], t2[i]);
|
||||
}
|
||||
}
|
||||
@ -74,7 +75,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::stack);
|
||||
}
|
||||
}
|
||||
@ -85,7 +86,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::native::_stack);
|
||||
}
|
||||
}
|
||||
@ -96,7 +97,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::native::_stack_cpu);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <ATen/Operators.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
@ -34,7 +35,7 @@ TEST(PackedtensoraccessorTest, TransposeTest) {
|
||||
t = rand({size}, CPU(kFloat));
|
||||
auto original_1d = t.packed_accessor64<float, 1, DefaultPtrTraits>();
|
||||
auto transposed_1d = original_1d.transpose(0, 0);
|
||||
for (int i = 0; i < size; i++){
|
||||
for (const auto i : c10::irange(size)) {
|
||||
ASSERT_EQ(original_1d[i], transposed_1d[i]);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/native/Pow.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <torch/types.h>
|
||||
#include <torch/utils.h>
|
||||
@ -203,7 +204,7 @@ void tensor_pow_tensor(const Vals vals, c10::ScalarType vals_dtype, Pows pows, c
|
||||
std::cout.precision(dbl::max_digits10);
|
||||
|
||||
const auto vals_tensor = torch::tensor(vals, vals_dtype);
|
||||
for (size_t shift = 0; shift < pows.size(); shift++) {
|
||||
for (const auto shift : c10::irange(pows.size())) {
|
||||
const auto pows_tensor = torch::tensor(pows, pows_dtype);
|
||||
|
||||
const auto actual_pow = vals_tensor.pow(pows_tensor);
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For quantize_val
|
||||
#include <ATen/native/quantized/affine_quantizer.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/quantized/Quantizer.h>
|
||||
|
||||
using namespace at;
|
||||
@ -30,14 +31,14 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
// int_repr
|
||||
Tensor int_repr = qr.int_repr();
|
||||
auto* int_repr_data = int_repr.data_ptr<uint8_t>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(int_repr_data[i], 3);
|
||||
}
|
||||
|
||||
// Check for correct quantization
|
||||
auto r_data = r.data_ptr<float>();
|
||||
auto qr_data = qr.data_ptr<quint8>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(
|
||||
native::quantize_val<quint8>(scale, zero_point, r_data[i]).val_,
|
||||
qr_data[i].val_);
|
||||
@ -46,10 +47,10 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
// Check for correct dequantization
|
||||
Tensor rqr = qr.dequantize();
|
||||
auto rqr_data = rqr.data_ptr<float>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(r_data[i], rqr_data[i]);
|
||||
}
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(
|
||||
r_data[i],
|
||||
native::dequantize_val(qr.q_scale(), qr.q_zero_point(), qr_data[i]));
|
||||
@ -60,7 +61,7 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
int64_t new_zero_point = 1;
|
||||
Tensor reqr = at::quantize_per_tensor(r, new_scale, new_zero_point, kQInt8);
|
||||
auto reqr_data = reqr.data_ptr<qint8>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
reqr_data[i].val_ =
|
||||
native::requantize_val<quint8, qint8>(
|
||||
scale, zero_point, new_scale, new_zero_point, qr_data[i])
|
||||
@ -85,7 +86,7 @@ TEST(TestQTensor, RoundingMode) {
|
||||
Tensor qx = at::quantize_per_tensor(x, /*scale=*/1.0, zero_point, kQUInt8);
|
||||
|
||||
auto qx_data = qx.data_ptr<quint8>();
|
||||
for (size_t idx = 0; idx < x_values.size(); ++idx) {
|
||||
for (const auto idx : c10::irange(x_values.size())) {
|
||||
ASSERT_EQ(qx_expect[idx], qx_data[idx].val_)
|
||||
<< "Tie breaking during rounding element " << idx << " failed!";
|
||||
}
|
||||
@ -108,14 +109,14 @@ TEST(TestQTensor, EmptyQuantized) {
|
||||
{numel}, at::device(at::kCPU).dtype(kQUInt8), scale, zero_point);
|
||||
// Assigning to QTensor
|
||||
auto* q_data = q.data_ptr<quint8>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
q_data[i].val_ = val;
|
||||
}
|
||||
|
||||
// dequantize
|
||||
auto r = q.dequantize();
|
||||
auto* r_data = r.data_ptr<float>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ(r_data[i], (val - zero_point) * scale);
|
||||
}
|
||||
}
|
||||
@ -134,14 +135,14 @@ TEST(TestQTensor, EmptyPerchannelQuantized) {
|
||||
at::device(at::kCPU).dtype(kQUInt8));
|
||||
// Assigning to QTensor
|
||||
auto* q_data = q.data_ptr<quint8>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
q_data[i].val_ = val;
|
||||
}
|
||||
|
||||
// dequantize
|
||||
auto r = q.dequantize();
|
||||
auto* r_data = r.data_ptr<float>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ(
|
||||
r_data[i],
|
||||
(val - zero_points[i].item().to<int>()) * scales[i].item().to<float>());
|
||||
@ -222,7 +223,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) {
|
||||
custom_vec->reserve(numel);
|
||||
|
||||
uint8_t* custom_data = custom_vec->data();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
custom_data[i] = i;
|
||||
}
|
||||
bool customDataDeleted{false};
|
||||
@ -236,7 +237,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) {
|
||||
Tensor qtensor = at::from_blob_quantized_per_tensor_affine(custom_data, shape, deleter, scale, zero_point, options);
|
||||
|
||||
uint8_t* q_data = (uint8_t*)qtensor.data_ptr<quint8>();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ((int)custom_data[i], (int)q_data[i]);
|
||||
}
|
||||
ASSERT_EQ((float)qtensor.q_scale(), (float)scale);
|
||||
@ -258,7 +259,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) {
|
||||
custom_vec->reserve(numel);
|
||||
|
||||
uint8_t* custom_data = custom_vec->data();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
custom_data[i] = i;
|
||||
}
|
||||
bool customDataDeleted{false};
|
||||
@ -271,7 +272,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) {
|
||||
{
|
||||
Tensor qtensor = at::from_blob_quantized_per_channel_affine(custom_data, shape, deleter, scales, zero_points, ch_axis, options);
|
||||
uint8_t* q_data = (uint8_t*)qtensor.data_ptr<quint8>();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ((int)custom_data[i], (int)q_data[i]);
|
||||
}
|
||||
ASSERT_TRUE(at::allclose(qtensor.q_per_channel_scales(), scales));
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/init.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
|
||||
@ -8,13 +9,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) {
|
||||
caffe2::Tensor c2_tensor(caffe2::CPU);
|
||||
c2_tensor.Resize(4, 4);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
}
|
||||
@ -22,13 +23,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) {
|
||||
TEST(Caffe2ToPytorch, Simple) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
}
|
||||
@ -37,7 +38,7 @@ TEST(Caffe2ToPytorch, ExternalData) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers)
|
||||
int64_t buf[16];
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
buf[i] = i;
|
||||
}
|
||||
c2_tensor.ShareExternalPointer(buf, 16 * sizeof(int64_t));
|
||||
@ -48,7 +49,7 @@ TEST(Caffe2ToPytorch, ExternalData) {
|
||||
at_tensor.permute({1, 0});
|
||||
at_tensor.permute({1, 0});
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
ASSERT_FALSE(at_tensor.storage().resizable());
|
||||
@ -60,7 +61,7 @@ TEST(Caffe2ToPytorch, Op) {
|
||||
caffe2::Tensor c2_tensor(caffe2::CPU);
|
||||
c2_tensor.Resize(3, 3);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 9; i++) {
|
||||
for (const auto i : c10::irange(9)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
@ -107,7 +108,7 @@ TEST(Caffe2ToPytorch, PartiallyInitialized) {
|
||||
TEST(Caffe2ToPytorch, MutualResizes) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({5, 5}, at::kFloat);
|
||||
auto data = c2_tensor.mutable_data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
data[i] = 0;
|
||||
}
|
||||
|
||||
@ -171,7 +172,7 @@ TEST(PytorchToCaffe2, Op) {
|
||||
auto result = XBlobGetMutableTensor(workspace.CreateBlob("d"), {5, 5}, at::kCPU);
|
||||
|
||||
auto it = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(it[i], 3.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
@ -202,7 +203,7 @@ TEST(PytorchToCaffe2, SharedStorageRead) {
|
||||
|
||||
auto result = XBlobGetMutableTensor(workspace.CreateBlob("c"), {5, 5}, at::kCPU);
|
||||
auto it = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(it[i], 2.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
@ -259,7 +260,7 @@ TEST(PytorchToCaffe2, Strided) {
|
||||
ASSERT_ANY_THROW(caffe2::Tensor c2_tensor(at_tensor));
|
||||
// but calling contiguous is fine
|
||||
caffe2::Tensor c2_tensor(at_tensor.contiguous());
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(c2_tensor.data<float>()[i], 1.0);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <test/cpp/tensorexpr/test_base.h>
|
||||
#include <thread>
|
||||
|
||||
@ -13,7 +14,8 @@ void test(int given_num_threads) {
|
||||
ASSERT_TRUE(given_num_threads >= 0);
|
||||
ASSERT_EQ(at::get_num_threads(), given_num_threads);
|
||||
auto t_sum = t.sum();
|
||||
for (int i = 0; i < 1000; ++i) {
|
||||
for (const auto i : c10::irange(1000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
t_sum = t_sum + t.sum();
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/test/vec_test_all_types.h>
|
||||
#include <c10/util/irange.h>
|
||||
namespace {
|
||||
#if GTEST_HAS_TYPED_TEST
|
||||
template <typename T>
|
||||
@ -455,7 +456,7 @@ namespace {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
CACHE_ALIGN VT expected_vals[vec::size()];
|
||||
auto vals = 1 << (vec::size());
|
||||
for (int val = 0; val < vals; ++val) {
|
||||
for (const auto val : c10::irange(vals)) {
|
||||
for (int i = 0; i < vec::size(); ++i) {
|
||||
if (val & (1 << i)) {
|
||||
test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
|
||||
@ -747,7 +748,7 @@ namespace {
|
||||
CACHE_ALIGN VT test_vals[vec::size()];
|
||||
//all sets will be within 0 2^(n-1)
|
||||
auto power_sets = 1 << (vec::size());
|
||||
for (int expected = 0; expected < power_sets; expected++) {
|
||||
for (const auto expected : c10::irange(power_sets)) {
|
||||
// generate test_val based on expected
|
||||
for (int i = 0; i < vec::size(); ++i)
|
||||
{
|
||||
@ -894,7 +895,7 @@ namespace {
|
||||
void blend_init(T(&a)[N], T(&b)[N]) {
|
||||
a[0] = (T)1.0;
|
||||
b[0] = a[0] + (T)N;
|
||||
for (int i = 1; i < N; i++) {
|
||||
for (const auto i : c10::irange(1, N)) {
|
||||
a[i] = a[i - 1] + (T)(1.0);
|
||||
b[i] = b[i - 1] + (T)(1.0);
|
||||
}
|
||||
@ -905,7 +906,7 @@ namespace {
|
||||
auto add = Complex<float>(1., 100.);
|
||||
a[0] = Complex<float>(1., 100.);
|
||||
b[0] = Complex<float>(5., 1000.);
|
||||
for (int i = 1; i < 4; i++) {
|
||||
for (const auto i : c10::irange(1, 4)) {
|
||||
a[i] = a[i - 1] + add;
|
||||
b[i] = b[i - 1] + add;
|
||||
}
|
||||
@ -1051,7 +1052,8 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
@ -1088,7 +1090,8 @@ namespace {
|
||||
ValueGen<int> generator(min_val, max_val, seed.add(1));
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
@ -1135,7 +1138,8 @@ namespace {
|
||||
ValueGen<int32_t> generator(min_val, max_val, seed);
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(1));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
@ -1172,7 +1176,8 @@ namespace {
|
||||
typename vec::int_vec_return_type expected_int_ret;
|
||||
auto seed = TestSeed();
|
||||
ValueGen<underlying> generator(min_val, max_val, seed);
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
//generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
@ -1251,7 +1256,7 @@ namespace {
|
||||
CACHE_ALIGN VT ref_y[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<VT> generator(VT(-100), VT(100), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x1[i] = generator.get();
|
||||
x2[i] = generator.get();
|
||||
x3[i] = generator.get();
|
||||
@ -1263,19 +1268,19 @@ namespace {
|
||||
};
|
||||
// test map: y = x1
|
||||
at::vec::map<VT>([](vec x) { return x; }, y, x1, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map2: y = x1 + x2
|
||||
at::vec::map2<VT>([](vec x1, vec x2) { return x1 + x2; }, y, x1, x2, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map3: y = x1 + x2 + x3
|
||||
at::vec::map3<VT>([](vec x1, vec x2, vec x3) { return x1 + x2 + x3; }, y, x1, x2, x3, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map4: y = x1 + x2 + x3 + x4
|
||||
at::vec::map4<VT>([](vec x1, vec x2, vec x3, vec x4) { return x1 + x2 + x3 + x4; }, y, x1, x2, x3, x4, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; }
|
||||
cmp(y, ref_y);
|
||||
}
|
||||
TYPED_TEST(FunctionalBF16Tests, Reduce) {
|
||||
@ -1294,7 +1299,7 @@ namespace {
|
||||
CACHE_ALIGN VT x_b3[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<RT> generator(RT(-1), RT(1), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x_f1[i] = generator.get();
|
||||
x_f2[i] = generator.get();
|
||||
x_f3[i] = generator.get();
|
||||
@ -1362,7 +1367,7 @@ namespace {
|
||||
CACHE_ALIGN VT y_b[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<RT> generator(RT(-1), RT(1), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x_f1[i] = generator.get();
|
||||
x_f2[i] = generator.get();
|
||||
x_f3[i] = generator.get();
|
||||
@ -1379,7 +1384,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map<RT>([](auto x) { return x; }, y_f, x_f1, len);
|
||||
at::vec::map<VT>([](auto x) { return x; }, y_b, x_b1, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1388,7 +1393,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map2<RT>([](auto x, auto y) { return x + y; }, y_f, x_f1, x_f2, len);
|
||||
at::vec::map2<VT>([](auto x, auto y) { return x + y; }, y_b, x_b1, x_b2, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap2, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1397,7 +1402,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map3<RT>([](auto x, auto y, auto z) { return x + y * z; }, y_f, x_f1, x_f2, x_f3, len);
|
||||
at::vec::map3<VT>([](auto x, auto y, auto z) { return x + y * z; }, y_b, x_b1, x_b2, x_b3, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap3, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1406,7 +1411,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map4<RT>([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_f, x_f1, x_f2, x_f3, x_f4, len);
|
||||
at::vec::map4<VT>([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_b, x_b1, x_b2, x_b3, x_b4, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap4, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
@ -869,8 +870,7 @@ public:
|
||||
act.store(actArr);
|
||||
if (bitwise)
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
BVT b_exp = bit_cast<BVT>(expArr[i]);
|
||||
BVT b_act = bit_cast<BVT>(actArr[i]);
|
||||
EXPECT_EQ(b_exp, b_act) << getDetail(i / unitStorageCount);
|
||||
@ -880,8 +880,7 @@ public:
|
||||
}
|
||||
else if (checkWithTolerance)
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
EXPECT_EQ(nearlyEqual<UVT>(expArr[i], actArr[i], absErr), true) << expArr[i] << "!=" << actArr[i] << "\n" << getDetail(i / unitStorageCount);
|
||||
if (::testing::Test::HasFailure())
|
||||
return true;
|
||||
@ -889,8 +888,7 @@ public:
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
if (std::is_same<UVT, float>::value)
|
||||
{
|
||||
if (!check_both_nan(expArr[i], actArr[i])) {
|
||||
@ -952,8 +950,9 @@ void test_unary(
|
||||
UVT start = dmn_argc > 0 ? dmn.ArgsDomain[0].start : default_start;
|
||||
UVT end = dmn_argc > 0 ? dmn.ArgsDomain[0].end : default_end;
|
||||
ValueGen<VT> generator(start, end, seed.add(changeSeedBy));
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
//map operator
|
||||
@ -1011,8 +1010,9 @@ void test_binary(
|
||||
UVT end1 = dmn_argc > 1 ? dmn.ArgsDomain[1].end : default_end;
|
||||
ValueGen<VT> generator0(start0, end0, seed.add(changeSeedBy));
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
@ -1076,8 +1076,9 @@ void test_ternary(
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
ValueGen<VT> generator2(start2, end2, seed.add(changeSeedBy + 2));
|
||||
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/core/Vitals.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace at::vitals;
|
||||
@ -62,7 +63,7 @@ TEST(Vitals, MultiString) {
|
||||
}
|
||||
|
||||
TEST(Vitals, OnAndOff) {
|
||||
for (auto i = 0; i < 2; ++i) {
|
||||
for (const auto i : c10::irange(2)) {
|
||||
std::stringstream buffer;
|
||||
|
||||
std::streambuf* sbuf = std::cout.rdbuf();
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/BatchedTensorImpl.h>
|
||||
#include <ATen/VmapTransforms.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -55,7 +56,7 @@ TEST(VmapTest, TestBatchedTensor) {
|
||||
// returns {{lvl=0,dim=0}, {lvl=1,dim=1}, ..., {lvl=kVmapNumLevels-1,dim=kVmapNumLevels-1}};
|
||||
static BatchDims maxBatchDimsAtFront() {
|
||||
BatchDims result;
|
||||
for (int64_t lvl = 0; lvl < kVmapNumLevels; lvl++) {
|
||||
for (const auto lvl : c10::irange(kVmapNumLevels)) {
|
||||
result.emplace_back(lvl, /*dim=*/lvl);
|
||||
}
|
||||
return result;
|
||||
@ -169,7 +170,8 @@ TEST(VmapTest, TestBatchedTensorActualDim) {
|
||||
{
|
||||
// ActualDim on kVmapMaxTensorDims sized underlying tensor
|
||||
auto tensor = ones({});
|
||||
for (int64_t i = 0; i < kVmapMaxTensorDims; i++) {
|
||||
for (const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
tensor = tensor.unsqueeze(0);
|
||||
}
|
||||
ASSERT_EQ(tensor.dim(), kVmapMaxTensorDims);
|
||||
@ -260,7 +262,7 @@ TEST(VmapTest, TestMultiBatchVmapTransform) {
|
||||
BatchDims batch_dims = {
|
||||
{0, 2}, {1, 1}, {2, kVmapNumLevels - 1}, {3, 5}, {4, 0}, {5, 3}, {6, 4}
|
||||
};
|
||||
for (int64_t level = 7; level < kVmapNumLevels; level++ ) {
|
||||
for (const auto level : c10::irange(7, kVmapNumLevels)) {
|
||||
batch_dims.emplace_back(level, /*dim=*/level - 1);
|
||||
}
|
||||
auto tensor = ones(sizes);
|
||||
@ -303,7 +305,7 @@ TEST(VmapTest, TestVmapPhysicalViewGetPhysicalDims) {
|
||||
|
||||
static void checkBatchDimsEqual(BatchDimsRef bdims, BatchDimsRef expected_bdims) {
|
||||
ASSERT_EQ(bdims.size(), expected_bdims.size());
|
||||
for (int64_t idx = 0; idx < bdims.size(); idx++) {
|
||||
for (const auto idx : c10::irange(bdims.size())) {
|
||||
ASSERT_EQ(bdims[idx].dim(), expected_bdims[idx].dim());
|
||||
ASSERT_EQ(bdims[idx].level(), expected_bdims[idx].level());
|
||||
}
|
||||
@ -394,7 +396,7 @@ TEST(VmapTest, TestBatchedTensorSum) {
|
||||
static void checkBroadcastingVmapTransform(TensorList inputs, TensorList expected_outputs) {
|
||||
auto outputs = BroadcastingVmapTransform::logicalToPhysical(inputs);
|
||||
ASSERT_EQ(outputs.size(), expected_outputs.size());
|
||||
for (int64_t idx = 0; idx < outputs.size(); idx++) {
|
||||
for (const auto idx : c10::irange(outputs.size())) {
|
||||
const auto& output = outputs[idx].tensor();
|
||||
ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr());
|
||||
ASSERT_TRUE(at::allclose(output, expected_outputs[idx]));
|
||||
@ -878,7 +880,7 @@ TEST(VmapTest, TestBatchedTensorPermute) {
|
||||
static void checkMultiBatchVmapTransform(TensorList inputs, TensorList expected_outputs) {
|
||||
auto outputs = MultiBatchVmapTransform::logicalToPhysical(inputs);
|
||||
ASSERT_EQ(outputs.size(), expected_outputs.size());
|
||||
for (int64_t idx = 0; idx < outputs.size(); idx++) {
|
||||
for (const auto idx : c10::irange(outputs.size())) {
|
||||
const auto& output = outputs[idx].tensor();
|
||||
ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr());
|
||||
ASSERT_EQ(output.sizes(), expected_outputs[idx].sizes());
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <ATen/vulkan/Context.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs) {
|
||||
double maxValue = 0.0;
|
||||
@ -145,7 +146,7 @@ TEST(VulkanTest, addScalar) {
|
||||
auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -772,7 +773,7 @@ TEST(VulkanTest, tensor5d_transpose) {
|
||||
at::empty({1, 2, 3, 2, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -816,7 +817,7 @@ TEST(VulkanTest, slice) {
|
||||
at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -841,7 +842,7 @@ TEST(VulkanTest, select) {
|
||||
at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -866,7 +867,7 @@ TEST(VulkanTest, unsqueeze) {
|
||||
at::empty({1, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
#include <torch/csrc/jit/tensorexpr/loopnest.h>
|
||||
@ -15,14 +16,14 @@ class ConcatBench : public benchmark::Fixture {
|
||||
input_sizes_ = std::move(input_sizes);
|
||||
concat_dim_ = concat_dim;
|
||||
inputs_.resize(input_sizes_.size());
|
||||
for (size_t i = 0; i < input_sizes_.size(); ++i) {
|
||||
for (const auto i : c10::irange(input_sizes_.size())) {
|
||||
inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
|
||||
}
|
||||
output_size_.resize(input_sizes_.front().size());
|
||||
for (size_t i = 0; i < output_size_.size(); ++i) {
|
||||
for (const auto i : c10::irange(output_size_.size())) {
|
||||
if (i == static_cast<size_t>(concat_dim_)) {
|
||||
output_size_[i] = 0;
|
||||
for (size_t j = 0; j < input_sizes_.size(); ++j) {
|
||||
for (const auto j : c10::irange(input_sizes_.size())) {
|
||||
output_size_[i] += input_sizes_[j][i];
|
||||
}
|
||||
} else {
|
||||
@ -64,7 +65,7 @@ class ConcatBench : public benchmark::Fixture {
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
int d = 0;
|
||||
std::vector<int> cumulative_concat_dim_sizes(num_inputs);
|
||||
for (size_t i = 0; i < num_inputs; ++i) {
|
||||
for (const auto i : c10::irange(num_inputs)) {
|
||||
cumulative_concat_dim_sizes[i] = d;
|
||||
d += input_sizes_[i][concat_dim_];
|
||||
}
|
||||
@ -119,7 +120,7 @@ class ConcatBench : public benchmark::Fixture {
|
||||
{input_sizes_[i][0], input_sizes_[i][1]},
|
||||
kFloat));
|
||||
std::vector<VarPtr> for_vars(num_inputs);
|
||||
for (size_t d = 0; d < num_dims; ++d) {
|
||||
for (const auto d : c10::irange(num_dims)) {
|
||||
for_vars[d] =
|
||||
alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/core/InferenceMode.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/codegen/fuser/interface.h>
|
||||
#include <torch/torch.h>
|
||||
|
||||
@ -22,7 +23,8 @@ static void FusedOverhead(benchmark::State& state) {
|
||||
auto z = torch::ones({1});
|
||||
|
||||
// Warmup.
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
m.run_method("two_adds", x, y, z);
|
||||
}
|
||||
|
||||
@ -43,7 +45,8 @@ static void UnfusedOverhead(benchmark::State& state) {
|
||||
auto z = torch::ones({1});
|
||||
|
||||
// Warmup.
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
m.run_method("two_adds", x, y, z);
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
@ -53,7 +54,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
|
||||
float* c_ptr = C.data_ptr<float>();
|
||||
std::vector<void*> args({c_ptr, a_ptr, b_ptr});
|
||||
cg.value<int>(args);
|
||||
for (int i = 0; i < M; i++) {
|
||||
for (const auto i : c10::irange(M)) {
|
||||
float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
|
||||
TORCH_CHECK(diff < 1e-5);
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
@ -78,7 +79,7 @@ static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
|
||||
int size = A.numel();
|
||||
TORCH_CHECK(B.numel() == 1);
|
||||
*pB = 0.;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
*pB += pA[i];
|
||||
}
|
||||
}
|
||||
@ -101,18 +102,18 @@ static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
|
||||
TORCH_CHECK(size % kChunkSize == 0);
|
||||
*pB = 0.;
|
||||
float temp[kChunkSize];
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
temp[j] = 0;
|
||||
}
|
||||
|
||||
int chunk_count = size / kChunkSize;
|
||||
for (int i = 0; i < chunk_count; i++) {
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto i : c10::irange(chunk_count)) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
temp[j] += pA[i * kChunkSize + j];
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
*pB += temp[j];
|
||||
}
|
||||
}
|
||||
@ -163,7 +164,7 @@ static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
|
||||
temp = _mm256_setzero_ps();
|
||||
|
||||
int tile_count = size / kChunkSize;
|
||||
for (int i = 0; i < tile_count; i++) {
|
||||
for (const auto i : c10::irange(tile_count)) {
|
||||
__m256 data = _mm256_load_ps(pA + i * kChunkSize);
|
||||
temp = _mm256_add_ps(temp, data);
|
||||
}
|
||||
@ -196,7 +197,7 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
|
||||
kChunkSize,
|
||||
" ! = 0");
|
||||
__m256 t[kTileSize];
|
||||
for (int j = 0; j < kTileSize; j++) {
|
||||
for (const auto j : c10::irange(kTileSize)) {
|
||||
t[j] = _mm256_setzero_ps();
|
||||
}
|
||||
|
||||
@ -211,7 +212,7 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
|
||||
}
|
||||
|
||||
float result = sum_f32x8(t[0]);
|
||||
for (int j = 1; j < kTileSize; j++) {
|
||||
for (const auto j : c10::irange(1, kTileSize)) {
|
||||
result += sum_f32x8(t[j]);
|
||||
}
|
||||
*pB = result;
|
||||
@ -540,16 +541,16 @@ BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) {
for (int m_outer = 0; m_outer < M; m_outer += Mb) {
float bregs[Mb][Nb] = {0.0f};
for (int n_outer = 0; n_outer < N; n_outer += Nb) {
for (int m_inner = 0; m_inner < Mb; m_inner++) {
for (int n_inner = 0; n_inner < Nb; n_inner++) {
for (const auto m_inner : c10::irange(Mb)) {
for (const auto n_inner : c10::irange(Nb)) {
bregs[m_inner][n_inner] +=
a[(m_outer + m_inner) * N + n_outer + n_inner];
}
}
}
for (int m_inner = 0; m_inner < Mb; m_inner++) {
for (const auto m_inner : c10::irange(Mb)) {
b[m_outer + m_inner] = 0.f;
for (int n_inner = 0; n_inner < Nb; n_inner++) {
for (const auto n_inner : c10::irange(Nb)) {
b[m_outer + m_inner] += bregs[m_inner][n_inner];
}
}

@ -24,6 +24,7 @@
#include "caffe2/core/operator.h"
#include "caffe2/utils/string_utils.h"
#include "c10/util/string_utils.h"
#include <c10/util/irange.h>

using std::map;
using std::shared_ptr;
@ -55,12 +56,12 @@ void writeTextOutput(
int dims_size = tensor_proto.dims_size();
long long elem_dim_size =
dims_size > 1 ? tensor_proto.dims(1) : tensor_proto.dims(0);
for (int i = 2; i < dims_size; i++) {
for (const auto i : c10::irange(2, dims_size)) {
elem_dim_size *= tensor_proto.dims(i);
}
std::vector<std::string> lines;
std::string dims;
for (int i = 0; i < dims_size; i++) {
for (const auto i : c10::irange(dims_size)) {
int dim = tensor_proto.dims(i);
if (i > 0) {
dims += ", ";

@ -2,6 +2,7 @@
#include <c10/core/DeviceType.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

// TODO: rename flags to C10
C10_DEFINE_bool(
@ -30,7 +31,7 @@ void memset_junk(void* data, size_t num) {
int32_t int64_count = num / sizeof(kJunkPattern64);
int32_t remaining_bytes = num % sizeof(kJunkPattern64);
int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
for (int i = 0; i < int64_count; i++) {
for (const auto i : c10::irange(int64_count)) {
data_i64[i] = kJunkPattern64;
}
if (remaining_bytes > 0) {

@ -5,6 +5,7 @@
#include <c10/core/WrapDimMinimal.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/util/Optional.h>
#include <c10/util/irange.h>

C10_DEFINE_bool(
caffe2_keep_on_shrink,
@ -335,7 +336,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const {
}
SmallVector<int64_t, 5> perm;
perm.resize(dim());
for (int64_t i = 0; i < dim(); i++) {
for (const auto i : c10::irange(dim())) {
perm[i] = i;
}
// Sort by strides, leaving 0 and 1 sized dims at the end of the array
@ -349,7 +350,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const {
sizes_and_strides_.stride_at_unchecked(b);
});
auto require_stride = 1;
for (int64_t i = 0; i < dim(); i++) {
for (const auto i : c10::irange(dim())) {
const auto size_perm_i = sizes_and_strides_.size_at_unchecked(perm[i]);
if (size_perm_i < 2) {
return true;

@ -19,6 +19,7 @@
#include <c10/util/Logging.h>
#include <c10/util/Optional.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <c10/util/python_stub.h>

// A global boolean variable to control whether we free memory when a Tensor
@ -68,7 +69,7 @@ inline std::vector<int64_t> ToVectorint64_t(ArrayRef<int> src) {
*/
inline int64_t size_from_dim_(int k, IntArrayRef dims) {
int64_t r = 1;
for (size_t i = k; i < dims.size(); ++i) {
for (const auto i : c10::irange(k, dims.size())) {
r *= dims[i];
}
return r;
@ -78,7 +79,7 @@ inline int64_t size_from_dim_(int k, IntArrayRef dims) {
inline int64_t size_to_dim_(int k, IntArrayRef dims) {
TORCH_CHECK((unsigned)k <= dims.size());
int64_t r = 1;
for (int i = 0; i < k; ++i) {
for (const auto i : c10::irange(k)) {
r *= dims[i];
}
return r;
@ -2163,7 +2164,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
auto old_numel = numel_;
sizes_and_strides_.resize(src.size());
int64_t new_numel = 1;
for (size_t i = 0; i < src.size(); ++i) {
for (const auto i : c10::irange(src.size())) {
new_numel *= src[i];
sizes_and_strides_.size_at_unchecked(i) = src[i];
}

@ -2,6 +2,7 @@

#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>

namespace c10 {
namespace impl {
@ -237,7 +238,7 @@ class InlineMultiStreamGuard {
static DeviceType getDeviceTypeOfStreams(ArrayRef<Stream> streams) {
TORCH_INTERNAL_ASSERT(!streams.empty());
DeviceType type = streams[0].device_type();
for (size_t idx = 1; idx < streams.size(); idx++) {
for (const auto idx : c10::irange(1, streams.size())) {
TORCH_CHECK_VALUE(
streams[idx].device_type() == type,
"Streams have a mix of device types: stream 0 is on ",

@ -201,7 +201,7 @@ static SizesAndStrides makeBig(int offset = 0) {

static void checkSmall(const SizesAndStrides& sm, int offset = 0) {
std::vector<int64_t> sizes(3), strides(3);
for (int ii = 0; ii < 3; ++ii) {
for (const auto ii : c10::irange(3)) {
sizes[ii] = ii + 1 + offset;
strides[ii] = 2 * (ii + 1 + offset);
}
@ -210,7 +210,7 @@ static void checkSmall(const SizesAndStrides& sm, int offset = 0) {

static void checkBig(const SizesAndStrides& big, int offset = 0) {
std::vector<int64_t> sizes(8), strides(8);
for (int ii = 0; ii < 8; ++ii) {
for (const auto ii : c10::irange(8)) {
sizes[ii] = ii - 1 + offset;
strides[ii] = 2 * (ii - 1 + offset);
}

@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <c10/util/Bitset.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using c10::utils::bitset;
|
||||
|
||||
@ -37,7 +38,7 @@ TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenIsSet) {
|
||||
TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenOthersStayUnset) {
|
||||
bitset b;
|
||||
b.set(6);
|
||||
for (size_t i = 0; i < 6; ++i) {
|
||||
for (const auto i : c10::irange(6)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 7; i < bitset::NUM_BITS(); ++i) {
|
||||
@ -56,10 +57,10 @@ TEST(BitsetTest, givenNonemptyBitset_whenSettingBit_thenOthersStayAtOldValue) {
|
||||
bitset b;
|
||||
b.set(6);
|
||||
b.set(30);
|
||||
for (size_t i = 0; i < 6; ++i) {
|
||||
for (const auto i : c10::irange(6)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 7; i < 30; ++i) {
|
||||
for (const auto i : c10::irange(7, 30)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 31; i < bitset::NUM_BITS(); ++i) {
|
||||
@ -82,7 +83,7 @@ TEST(
|
||||
b.set(6);
|
||||
b.set(30);
|
||||
b.unset(6);
|
||||
for (size_t i = 0; i < 30; ++i) {
|
||||
for (const auto i : c10::irange(30)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
EXPECT_TRUE(b.get(30));
|
||||
@ -100,7 +101,7 @@ struct IndexCallbackMock final {
|
||||
|
||||
void expect_was_called_for_indices(std::vector<size_t> expected_indices) {
|
||||
EXPECT_EQ(expected_indices.size(), called_for_indices.size());
|
||||
for (size_t i = 0; i < expected_indices.size(); ++i) {
|
||||
for (const auto i : c10::irange(expected_indices.size())) {
|
||||
EXPECT_EQ(expected_indices[i], called_for_indices[i]);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
// clang-format off
|
||||
#include <c10/util/BFloat16.h>
|
||||
#include <c10/util/BFloat16-math.h>
|
||||
#include <c10/util/irange.h>
|
||||
// clang-format on
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@ -24,7 +25,7 @@ float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) {
|
||||
TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float in[100];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
|
||||
in[i] = i + 1.25;
|
||||
}
|
||||
@ -34,7 +35,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float out[100];
|
||||
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
bfloats[i].x = c10::detail::bits_from_f32(in[i]);
|
||||
out[i] = c10::detail::f32_from_bits(bfloats[i].x);
|
||||
|
||||
@ -47,7 +48,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float in[100];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
|
||||
in[i] = i + 1.25;
|
||||
}
|
||||
@ -57,7 +58,7 @@ TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float out[100];
|
||||
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
bfloats[i].x = c10::detail::round_to_nearest_even(in[i]);
|
||||
out[i] = c10::detail::f32_from_bits(bfloats[i].x);
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/order_preserving_flat_hash_map.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@ -15,14 +16,15 @@ using dict_int_int =
|
||||
ska_ordered::order_preserving_flat_hash_map<int64_t, int64_t>;
|
||||
|
||||
dict_int_int test_dict(dict_int_int& dict) {
|
||||
for (int64_t i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
dict[i] = i + 1;
|
||||
}
|
||||
|
||||
int64_t i = 0;
|
||||
int64_t entry_i = 0;
|
||||
for (auto entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(entry.first == i && entry.second == i + 1);
|
||||
++i;
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
entry.first == entry_i && entry.second == entry_i + 1);
|
||||
++entry_i;
|
||||
}
|
||||
|
||||
// erase a few entries by themselves
|
||||
@ -33,29 +35,32 @@ dict_int_int test_dict(dict_int_int& dict) {
|
||||
|
||||
// erase via iterators
|
||||
auto begin = dict.begin();
|
||||
for (size_t i = 0; i < 20; ++i)
|
||||
for (const auto i : c10::irange(20)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
begin++;
|
||||
}
|
||||
|
||||
auto end = begin;
|
||||
for (size_t i = 0; i < 20; ++i) {
|
||||
for (const auto i : c10::irange(20)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
dict.erase(begin, end);
|
||||
|
||||
std::vector<size_t> order;
|
||||
for (size_t i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
if (!erase_set.count(i)) {
|
||||
order.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
i = 0;
|
||||
entry_i = 0;
|
||||
for (auto entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(order[i] == entry.first);
|
||||
TORCH_INTERNAL_ASSERT(dict[order[i]] == entry.second);
|
||||
TORCH_INTERNAL_ASSERT(entry.second == order[i] + 1);
|
||||
i++;
|
||||
TORCH_INTERNAL_ASSERT(order[entry_i] == entry.first);
|
||||
TORCH_INTERNAL_ASSERT(dict[order[entry_i]] == entry.second);
|
||||
TORCH_INTERNAL_ASSERT(entry.second == order[entry_i] + 1);
|
||||
entry_i++;
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(dict.size() == order.size());
|
||||
return dict;
|
||||
@ -113,12 +118,12 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
for (auto init_dict_size : {27, 34, 41}) {
|
||||
bad_hash_dict dict;
|
||||
for (int64_t i = 0; i < init_dict_size; ++i) {
|
||||
for (const auto i : c10::irange(init_dict_size)) {
|
||||
dict[i] = i + 1;
|
||||
}
|
||||
|
||||
int64_t i = 0;
|
||||
for (auto entry : dict) {
|
||||
for (const auto& entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(entry.first == i && entry.second == i + 1);
|
||||
++i;
|
||||
}
|
||||
@ -131,20 +136,22 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
// erase a few entries via iterator
|
||||
auto begin = dict.begin();
|
||||
for (size_t i = 0; i < 10; ++i) {
|
||||
for (const auto j : c10::irange(10)) {
|
||||
(void)j; // Suppress unused variable warning
|
||||
begin++;
|
||||
}
|
||||
auto end = begin;
|
||||
for (size_t i = 0; i < 7; ++i) {
|
||||
for (const auto j : c10::irange(7)) {
|
||||
(void)j; // Suppress unused variable warning
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
dict.erase(begin, end);
|
||||
|
||||
std::vector<int64_t> order;
|
||||
for (int64_t i = 0; i < init_dict_size; ++i) {
|
||||
if (!erase_set.count(i)) {
|
||||
order.push_back(i);
|
||||
for (const auto j : c10::irange(init_dict_size)) {
|
||||
if (!erase_set.count(j)) {
|
||||
order.push_back(j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -167,7 +174,7 @@ TEST(OrderedPreservingDictTest, test_range_insert) {
|
||||
// check values
|
||||
const int nb_values = 1000;
|
||||
std::vector<std::pair<int, int>> values;
|
||||
for (int i = 0; i < nb_values; i++) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
// NOLINTNEXTLINE(modernize-use-emplace,performance-inefficient-vector-operation)
|
||||
values.push_back(std::make_pair(i, i + 1));
|
||||
}
|
||||
@ -190,7 +197,7 @@ TEST(OrderedPreservingDictTest, test_range_erase_all) {
|
||||
// insert x values, delete all
|
||||
const std::size_t nb_values = 1000;
|
||||
dict_int_int map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[i] = i + 1;
|
||||
}
|
||||
auto it = map.erase(map.begin(), map.end());
|
||||
@ -206,7 +213,7 @@ TEST(OrderedPreservingDictTest, test_range_erase) {
|
||||
|
||||
const std::size_t nb_values = 1000;
|
||||
HMap map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[c10::guts::to_string(i)] = i;
|
||||
auto begin = map.begin();
|
||||
for (size_t j = 0; j <= i; ++j, begin++) {
|
||||
@ -305,7 +312,7 @@ TEST(OrderedPreservingDictTest, test_copy_constructor_and_operator) {
|
||||
|
||||
const std::size_t nb_values = 100;
|
||||
HMap map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[c10::guts::to_string(i)] = c10::guts::to_string(i);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
#include <c10/util/Backtrace.h>
#include <c10/util/Optional.h>
#include <c10/util/Type.h>
#include <c10/util/irange.h>

#include <functional>
#include <memory>
@ -281,8 +282,7 @@ std::string get_backtrace(
// Toggles to true after the first skipped python frame.
bool has_skipped_python_frames = false;

for (size_t frame_number = 0; frame_number < callstack.size();
++frame_number) {
for (const auto frame_number : c10::irange(callstack.size())) {
const auto frame = parse_frame_information(symbols[frame_number]);

if (skip_python_frames && frame && is_python_frame(*frame)) {

@ -27,6 +27,7 @@
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
/*
|
||||
* TypeIdentifier is a small type containing an id.
|
||||
@ -170,7 +171,7 @@ struct TypeMetaData final {
|
||||
template <typename T>
|
||||
inline void _PlacementNew(void* ptr, size_t n) {
|
||||
T* typed_ptr = static_cast<T*>(ptr);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
new (typed_ptr + i) T;
|
||||
}
|
||||
}
|
||||
@ -234,7 +235,7 @@ template <typename T>
|
||||
inline void _Copy(const void* src, void* dst, size_t n) {
|
||||
const T* typed_src = static_cast<const T*>(src);
|
||||
T* typed_dst = static_cast<T*>(dst);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
typed_dst[i] = typed_src[i];
|
||||
}
|
||||
}
|
||||
@ -274,7 +275,7 @@ inline constexpr TypeMetaData::Copy* _PickCopy() {
|
||||
template <typename T>
|
||||
inline void _PlacementDelete(void* ptr, size_t n) {
|
||||
T* typed_ptr = static_cast<T*>(ptr);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
typed_ptr[i].~T();
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <string>
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/context.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/utils/math.h>
|
||||
@ -130,7 +131,7 @@ private:
|
||||
void assignListStartingAt(
|
||||
size_t offset,
|
||||
const std::vector<at::Tensor>& tensors) {
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
assignTo(Output(offset + i), tensors[i]);
|
||||
}
|
||||
}
|
||||
@ -176,7 +177,7 @@ private:
|
||||
std::stringstream descriptor;
|
||||
descriptor << op;
|
||||
std::vector<std::string> attrs;
|
||||
for(size_t i = 0; i < operator_def.arg_size(); i++) {
|
||||
for (const auto i : c10::irange(operator_def.arg_size())) {
|
||||
auto & attr = operator_def.arg(i);
|
||||
if(attr.name() == "operator" || attr.name() == "type" )
|
||||
continue;
|
||||
@ -223,7 +224,7 @@ private:
|
||||
std::vector<int64_t> ints =
|
||||
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
std::array<bool, N> result;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
result[i] = ints.at(i);
|
||||
}
|
||||
return result;
|
||||
|
@ -118,8 +118,8 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
if (!W_fbgemm->packed()) {
|
||||
float* W_fp16_trans = new float[W_size];
|
||||
fbgemm::Float16ToFloat_avx2(W_fbgemm->pmat(), W_fp16_trans, W_size);
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < K; j++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(K)) {
|
||||
W_fp16_[j * N + i] = W_fp16_trans[i * K + j];
|
||||
}
|
||||
}
|
||||
@ -136,8 +136,8 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
const auto& W = Input(1);
|
||||
W_data = W.template data<T_W>();
|
||||
// Transpose W
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < K; j++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(K)) {
|
||||
W_fp16_[j * N + i] = W_data[i * K + j];
|
||||
}
|
||||
}
|
||||
@ -352,7 +352,7 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
|
||||
float compute_L2_norm(float* A, int size) {
|
||||
float square_sum = 0.0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
square_sum += A[i] * A[i];
|
||||
}
|
||||
return std::sqrt(square_sum);
|
||||
@ -360,7 +360,7 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
|
||||
float compute_relative_error(float* A, float* A_ref, int size) {
|
||||
float error = 0.0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
error += (A[i] - A_ref[i]) * (A[i] - A_ref[i]);
|
||||
}
|
||||
error = std::sqrt(error);
|
||||
|
@ -22,7 +22,7 @@ void Int8DequantizeNNPI(
|
||||
const float X_scale,
|
||||
const int32_t X_offset) {
|
||||
float X_scale_fp32 = 1.0f / X_scale;
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
out[i] = (float)(static_cast<int32_t>(in[i]) - X_offset) / X_scale_fp32;
|
||||
}
|
||||
} // namespace
|
||||
|
@ -53,12 +53,12 @@ void Int8QuantizeNNPI(
|
||||
std::vector<float> inv_scalev(N, inv_scale_fp16);
|
||||
std::vector<float> offsetv(N, -offset_tmp);
|
||||
fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data());
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
offsetv[i] = round(offsetv[i]);
|
||||
}
|
||||
fbgemm::RoundToFloat16(
|
||||
offsetv.data(), offsetv.data(), N, false /* no clamping */);
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
float halfRes = offsetv[i];
|
||||
if (std::isinf(halfRes)) {
|
||||
if (halfRes > 0) {
|
||||
|
@ -29,7 +29,7 @@ void SwishFakeInt8NNPI(
|
||||
int32_t quant_val = 0;
|
||||
uint8_t result = 0;
|
||||
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
deq_val = (static_cast<uint8_t>(in[i]) - X_offset) / X_scale_fp32;
|
||||
deq_swish = deq_val / (1 + exp(-deq_val));
|
||||
quant_val = round(deq_swish / Y_scale + Y_offset);
|
||||
|
@ -129,7 +129,7 @@ class LayerNormFakeFp16Op final : public Operator<CPUContext> {
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
|
||||
false /*USE_ACC_FP16*/);
|
||||
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (const auto i : c10::irange(M)) {
|
||||
// fma_fp16(A, B, Out) -> Out = A * B + Out
|
||||
std::vector<float> out(N);
|
||||
std::memcpy(out.data(), bias_data.data(), sizeof(float) * N);
|
||||
@ -169,7 +169,7 @@ class LayerNormFakeFp16Op final : public Operator<CPUContext> {
|
||||
const int32_t qmin = std::numeric_limits<uint8_t>::min();
|
||||
const int32_t qmax = std::numeric_limits<uint8_t>::max();
|
||||
|
||||
for (int i = 0; i < Nout; i++) {
|
||||
for (const auto i : c10::irange(Nout)) {
|
||||
float halfRes = offsetv[i];
|
||||
halfRes = round(halfRes);
|
||||
if (std::isinf(halfRes)) {
|
||||
|
@ -85,7 +85,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
const auto scale_bias_offset = 2 * sizeof(at::Half);
|
||||
const int64_t input_fused_block_size = input_block_size + scale_bias_offset;
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
if (!use_fp16_for_embedding_only) {
|
||||
memset(rowTempSums[0].data(), 0, sizeof(float) * output_block_size);
|
||||
memset(rowTempSums[1].data(), 0, sizeof(float) * output_block_size);
|
||||
@ -135,7 +135,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
// Unpack int4 elements
|
||||
std::vector<float> input_rounded(output_block_size);
|
||||
int k = 0;
|
||||
for (int j = 0; j < input_block_size; j++) {
|
||||
for (const auto j : c10::irange(input_block_size)) {
|
||||
input_rounded[k++] =
|
||||
input[input_fused_block_size * indices_data[current] + j] & 0x0f;
|
||||
input_rounded[k++] =
|
||||
@ -150,7 +150,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
input_rounded.data(),
|
||||
product_rounded.data());
|
||||
|
||||
for (int j = 0; j < output_block_size; ++j) {
|
||||
for (const auto j : c10::irange(output_block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
|
||||
@ -190,7 +190,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
}
|
||||
|
||||
if (!use_fp16_for_embedding_only) {
|
||||
for (int j = 0; j < output_block_size; ++j) {
|
||||
for (const auto j : c10::irange(output_block_size)) {
|
||||
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
|
||||
}
|
||||
fbgemm::RoundToFloat16(
|
||||
|
@ -84,7 +84,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
const auto scale_bias_offset = 8 / sizeof(uint8_t);
|
||||
const int64_t fused_block_size = block_size + scale_bias_offset;
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
memset(out, 0, sizeof(float) * block_size);
|
||||
memset(rowTempSums[0].data(), 0, sizeof(float) * block_size);
|
||||
memset(rowTempSums[1].data(), 0, sizeof(float) * block_size);
|
||||
@ -152,7 +152,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
|
||||
// Fake fp16 rounding of input/ it is already ints
|
||||
std::vector<float> input_rounded(block_size);
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
input_rounded[j] =
|
||||
input[fused_block_size * indices_data[current] + j];
|
||||
}
|
||||
@ -164,7 +164,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
TypedAxpy<float, float>(
|
||||
block_size, scale, input_rounded.data(), product_rounded.data());
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
|
||||
@ -215,7 +215,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
block_size,
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
// Fake fp16 rounding of w x scale x input + w x bias
|
||||
@ -239,7 +239,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
block_size,
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
|
||||
} else if (use_acc_fp32) {
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
float deqVal = fake_fp16::fmafp32_avx_emulation(
|
||||
scale,
|
||||
input_rounded[j],
|
||||
@ -256,7 +256,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
|
||||
TypedAxpy<float, float>(block_size, scale, input_rounded.data(), out);
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
out[j] += bias;
|
||||
}
|
||||
}
|
||||
@ -264,7 +264,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
}
|
||||
|
||||
if (use_nnpi_fma || use_acc_fp32) {
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
|
||||
}
|
||||
}
|
||||
|
@ -94,7 +94,7 @@ class SparseLengthsReductionFakeFp16Op final : public Operator<CPUContext> {
|
||||
float* out = out_data;
|
||||
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
memset(out, 0, sizeof(float) * block_size);
|
||||
if (current + lengths[m] > index_size) {
|
||||
return false;
|
||||
|
@ -39,7 +39,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
|
||||
Y_scale = 1.0f / Y_scale;
|
||||
|
||||
// create table once
|
||||
for (int i = 0; i < lutSize; i++) {
|
||||
for (const auto i : c10::irange(lutSize)) {
|
||||
short input = i + tanhLUTMinOffset;
|
||||
float x = _cvtsh_ss(input);
|
||||
float tanh_x = tanh(x);
|
||||
@ -54,7 +54,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
|
||||
}
|
||||
|
||||
const float* X_data = X.template data<float>();
|
||||
for (int i = 0; i < X.numel(); i++) {
|
||||
for (const auto i : c10::irange(X.numel())) {
|
||||
short val = _cvtss_sh(X_data[i], 0);
|
||||
unsigned short max16BitPositive = 0x7FFF;
|
||||
unsigned short input16Bit = (*(unsigned short*)& val);
|
||||
|
@ -159,7 +159,7 @@ class SpatialBNFakeLoweredFp16Op : public Operator<CPUContext> {
|
||||
const int stride = C * HxW;
|
||||
const float* X_ptr = X;
|
||||
float* Y_ptr = Y;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
EigenArrayMap<float>(Y_ptr, HxW, C) =
|
||||
ConstEigenArrayMap<float>(X_ptr, HxW, C).rowwise() -
|
||||
mean_arr.transpose();
|
||||
@ -356,9 +356,9 @@ class SpatialBNFakeFp16Op : public Operator<CPUContext> {
|
||||
float* Y_ptr = Y;
|
||||
|
||||
// Do Y = X * scale + bias
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < C; j++) {
|
||||
for (int k = 0; k < HxW; k++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(C)) {
|
||||
for (const auto k : c10::irange(HxW)) {
|
||||
Y_ptr[HxW * j + k] = bias[j];
|
||||
}
|
||||
|
||||
|
@ -18,7 +18,7 @@ class SumFP16FP16AccOp : public Operator<Context> {
|
||||
size_t N = input0.numel();
|
||||
auto* output = Output(0, input0.sizes(), at::dtype<float>());
|
||||
// Dimension checking
|
||||
for (int i = 1; i < InputSize(); ++i) {
|
||||
for (const auto i : c10::irange(1, InputSize())) {
|
||||
if (output->sizes() != Input(i).sizes()) {
|
||||
CAFFE_THROW(
|
||||
"Check failed: output->sizes() == Input(i).sizes().",
|
||||
@ -37,7 +37,7 @@ class SumFP16FP16AccOp : public Operator<Context> {
|
||||
std::vector<float> t1(N);
|
||||
std::vector<float> t2(N);
|
||||
|
||||
for (auto i = 0; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(InputSize())) {
|
||||
fbgemm::RoundToFloat16(
|
||||
Input(i).template data<float>(),
|
||||
t1.data(),
|
||||
|
@ -85,13 +85,13 @@ class AllgatherOp final : public Operator<Context> {
|
||||
|
||||
// Verify tensors all have same size
|
||||
size_t size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same type
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -113,7 +113,7 @@ class AllgatherOp final : public Operator<Context> {
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.size = Input(1).numel();
|
||||
params.meta = Input(1).dtype();
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
}
|
||||
params.outputs.resize(OutputSize());
|
||||
|
@ -65,19 +65,19 @@ class AllreduceOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0U; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(0U, init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
auto size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same type
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ class AllreduceOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.outputs.resize(OutputSize());
|
||||
for (auto i = 0U; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(0U, params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -60,19 +60,19 @@ class BroadcastOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
size_t size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -94,7 +94,7 @@ class BroadcastOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.outputs.resize(OutputSize());
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -75,7 +75,7 @@ class ReduceScatterOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
@ -107,7 +107,7 @@ class ReduceScatterOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 2);
|
||||
params.outputs.resize(OutputSize() - 1);
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -1241,7 +1241,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
|
||||
return err;
|
||||
}
|
||||
|
||||
for(int i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
(*param)[i] = value[i];
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/blob_serializer_base.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/typeid.h>
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/simple_queue.h"
|
||||
@ -201,7 +203,7 @@ void ExtendRepeatedField(
|
||||
#else
|
||||
// We unfortunately do still need to support old protobuf versions in some
|
||||
// build configurations.
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
field->Add(0);
|
||||
}
|
||||
#endif
|
||||
@ -236,7 +238,7 @@ inline void CopyToProtoWithCast(
|
||||
context->template CopyToCPU<SrcType>(size, src, buffer.get());
|
||||
context->FinishDeviceComputation();
|
||||
field->Reserve(size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
field->Add(static_cast<DstType>(buffer[i]));
|
||||
}
|
||||
}
|
||||
@ -267,7 +269,7 @@ inline void CopyFromProtoWithCast(
|
||||
// CPUContext. Remove it if it is performance critical.
|
||||
unique_ptr<DstType[]> buffer(new DstType[size]);
|
||||
const SrcType* src = field.data();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
buffer[i] = static_cast<DstType>(src[i]);
|
||||
}
|
||||
context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
|
||||
#include <c10/core/GeneratorImpl.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/core/DistributionsHelper.h>
|
||||
#include <ATen/core/MT19937RNGEngine.h>
|
||||
#else
|
||||
@ -155,7 +156,7 @@ class TORCH_API CPUContext final : public BaseContext {
|
||||
static_cast<const void*>(src),
|
||||
static_cast<void*>(dst));
|
||||
} else {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <mutex>
|
||||
|
||||
#include <c10/util/Registry.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/string_view.h>
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
@ -248,7 +249,8 @@ class TORCH_API DBReader {
|
||||
*value = cursor_->value();
|
||||
|
||||
// In sharded mode, each read skips num_shards_ records
|
||||
for (uint32_t s = 0; s < num_shards_; s++) {
|
||||
for (const auto s : c10::irange(num_shards_)) {
|
||||
(void)s; // Suppress unused variable
|
||||
cursor_->Next();
|
||||
if (!cursor_->Valid()) {
|
||||
MoveToBeginning();
|
||||
@ -292,7 +294,8 @@ class TORCH_API DBReader {
|
||||
|
||||
void MoveToBeginning() const {
|
||||
cursor_->SeekToFirst();
|
||||
for (uint32_t s = 0; s < shard_id_; s++) {
|
||||
for (const auto s : c10::irange(shard_id_)) {
|
||||
(void)s; // Suppress unused variable
|
||||
cursor_->Next();
|
||||
CAFFE_ENFORCE(
|
||||
cursor_->Valid(), "Db has fewer rows than shard id: ", s, shard_id_);
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <c10/util/C++17.h>
|
||||
#include <c10/util/Metaprogramming.h>
|
||||
#include "caffe2/core/export_caffe2_op_to_c10.h"
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -136,7 +137,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
|
||||
void popOutputs_() {
|
||||
AT_ASSERT(stack_.size() == op_.schema().returns().size());
|
||||
for (size_t i = 0; i < op_.schema().returns().size(); ++i) {
|
||||
for (const auto i : c10::irange(op_.schema().returns().size())) {
|
||||
OperatorBase::SetOutputTensor(i, Tensor(std::move(stack_[i]).toTensor()));
|
||||
}
|
||||
stack_.clear();
|
||||
@ -146,7 +147,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
c10::List<at::Tensor> result;
|
||||
result.reserve(InputSize());
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (size_t i = 0; i < InputSize(); ++i) {
|
||||
for (const auto i : c10::irange(InputSize())) {
|
||||
result.emplace_back(Input(i));
|
||||
}
|
||||
return result;
|
||||
@ -156,7 +157,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
c10::List<at::Tensor> result;
|
||||
result.reserve(OutputSize());
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (size_t i = 0; i < OutputSize(); ++i) {
|
||||
for (const auto i : c10::irange(OutputSize())) {
|
||||
result.emplace_back(OperatorBase::OutputTensorOrUndefined(i));
|
||||
}
|
||||
return result;
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <ATen/core/op_registration/op_registration.h>
|
||||
#include <torch/csrc/jit/frontend/function_schema_parser.h>
|
||||
#include <c10/core/CompileTimeFunctionPointer.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/library.h>
|
||||
#include <vector>
|
||||
|
||||
@ -94,7 +95,7 @@ inline void _call_caffe2_op_from_c10(
|
||||
// We should not unwrap the list if we expect tensor list in the schema.
|
||||
torch::jit::push(*stack, outputs);
|
||||
} else {
|
||||
for (size_t i = 0; i < outputs.size(); ++i) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
torch::jit::push(*stack, outputs.extract(i));
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef NOM_CONVERTERS_DOT_H
|
||||
#define NOM_CONVERTERS_DOT_H
|
||||
|
||||
#include "c10/util/irange.h"
|
||||
#include "nomnigraph/Graph/Algorithms.h"
|
||||
#include "nomnigraph/Graph/Graph.h"
|
||||
#include "nomnigraph/Support/Casting.h"
|
||||
@ -42,7 +43,7 @@ class DotGenerator {
|
||||
for (const auto& node : sg.getNodes()) {
|
||||
generateNode(node, sg, output);
|
||||
}
|
||||
for (size_t i = 0; i < subgraphs.size(); ++i) {
|
||||
for (const auto i : c10::irange(subgraphs.size())) {
|
||||
const auto& subgraph = subgraphs[i];
|
||||
output << "subgraph cluster" << i << " {\n";
|
||||
output << "style=dotted;\n";
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H
|
||||
#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H
|
||||
|
||||
#include "c10/util/irange.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "nomnigraph/Graph/Graph.h"
|
||||
|
||||
@ -240,8 +241,7 @@ class MatchGraph : public Graph<MatchPredicate<GraphType>> {
|
||||
// criteria in the given order.
|
||||
|
||||
int currentEdgeIdx = 0;
|
||||
for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria;
|
||||
criteriaIdx++) {
|
||||
for (const auto criteriaIdx : c10::irange(numChildrenCriteria)) {
|
||||
auto childrenCriteriaRef = invertGraphTraversal
|
||||
? criteriaEdges[criteriaIdx]->tail()
|
||||
: criteriaEdges[criteriaIdx]->head();
|
||||
|
@ -9,13 +9,14 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "c10/util/Registry.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
#include "caffe2/utils/filler.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/Registry.h>
|
||||
#include <caffe2/core/common.h>
|
||||
#include <caffe2/core/logging.h>
|
||||
#include <caffe2/core/types.h>
|
||||
#include <caffe2/proto/caffe2_pb.h>
|
||||
#include <caffe2/utils/filler.h>
|
||||
#include <caffe2/utils/proto_utils.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -519,7 +520,7 @@ inline uint64_t nElemFromDim(const TensorShape& X, int dim = 0) {
|
||||
CAFFE_ENFORCE_GE(dim, 0, "Invalid maximum index specified");
|
||||
|
||||
uint64_t nElem = 1;
|
||||
for (int i = dim; i < X.dims_size(); ++i) {
|
||||
for (const auto i : c10::irange(dim, X.dims_size())) {
|
||||
nElem *= X.dims(i);
|
||||
}
|
||||
return nElem;
|
||||
@ -531,7 +532,7 @@ inline uint64_t nElemBetweenDim(const TensorShape& X, int start, int stop) {
|
||||
CAFFE_ENFORCE_LE(stop, X.dims_size(), "Invalid maximum index specified");
|
||||
|
||||
uint64_t nElem = 1;
|
||||
for (int i = start; i < stop; ++i) {
|
||||
for (const auto i : c10::irange(start, stop)) {
|
||||
nElem *= X.dims(i);
|
||||
}
|
||||
return nElem;
|
||||
@ -560,7 +561,7 @@ OpSchema::Cost PointwiseCostInference(
|
||||
const TensorShape X = inputs[0];
|
||||
uint64_t nElemX = nElemFromDim(X);
|
||||
uint64_t nElemRead = 0;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
for (const auto i : c10::irange(inputs.size())) {
|
||||
nElemRead += nElemFromDim(inputs[i]);
|
||||
}
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/typeid.h>
|
||||
|
||||
#include <algorithm>
|
||||
@ -218,7 +219,7 @@ class C10_EXPORT QTensor {
|
||||
*/
|
||||
inline int64_t size_from_dim(int k) const {
|
||||
int64_t r = 1;
|
||||
for (int i = k; i < dims_.size(); ++i) {
|
||||
for (const auto i : c10::irange(k, dims_.size())) {
|
||||
r *= dims_[i];
|
||||
}
|
||||
return r;
|
||||
@ -230,7 +231,7 @@ class C10_EXPORT QTensor {
|
||||
inline int64_t size_to_dim(int k) const {
|
||||
CAFFE_ENFORCE(k < dims_.size());
|
||||
int64_t r = 1;
|
||||
for (int i = 0; i < k; ++i) {
|
||||
for (const auto i : c10::irange(k)) {
|
||||
r *= dims_[i];
|
||||
}
|
||||
return r;
|
||||
|
@ -46,7 +46,7 @@ void QTensorSerializer<Context>::Serialize(
|
||||
blob_proto.set_type(kQTensorBlobQType);
|
||||
QTensorProto& proto = *blob_proto.mutable_qtensor();
|
||||
proto.set_name(name);
|
||||
for (int i = 0; i < qtensor.ndim(); ++i) {
|
||||
for (const auto i : c10::irange(qtensor.ndim())) {
|
||||
proto.add_dims(qtensor.dim32(i));
|
||||
}
|
||||
proto.set_precision(qtensor.precision());
|
||||
|
@ -73,7 +73,7 @@ TORCH_API ExportedStatMap toMap(const ExportedStatList& stats);
|
||||
* int main() {
|
||||
* MyCaffeClass a("first");
|
||||
* MyCaffeClass b("second");
|
||||
* for (int i = 0; i < 10; ++i) {
|
||||
* for (const auto i : c10::irange(10)) {
|
||||
* a.run(10);
|
||||
* b.run(5);
|
||||
* }
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
@ -34,7 +35,7 @@ void assertTensorEquals(
|
||||
float epsilon = 0.1f) {
|
||||
CAFFE_ENFORCE(tensor.IsType<T>());
|
||||
CAFFE_ENFORCE_EQ(tensor.numel(), data.size());
|
||||
for (auto idx = 0; idx < tensor.numel(); ++idx) {
|
||||
for (const auto idx : c10::irange(tensor.numel())) {
|
||||
if (tensor.IsType<float>()) {
|
||||
assertNear(tensor.data<T>()[idx], data[idx], epsilon);
|
||||
} else {
|
||||
@ -88,7 +89,7 @@ void randomFill(
|
||||
std::mt19937 gen(42);
|
||||
std::uniform_real_distribution<RealType> dis(
|
||||
static_cast<RealType>(min), static_cast<RealType>(max));
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
data[i] = dis(gen);
|
||||
}
|
||||
}
|
||||
|
@ -120,7 +120,7 @@ inline std::string GetUniqueName() {
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "_cuda_kernel_";
|
||||
for (int i = 0; i < len; ++i) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ss << alpha[rand() % (sizeof(alpha) - 1)];
|
||||
}
|
||||
return ss.str();
|
||||
|
@ -32,7 +32,7 @@ template <int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
for (const auto i : c10::irange(vs.size())) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
@ -86,7 +86,7 @@ void MaskMatrix_Inc<float, CPUContext>(
|
||||
int /*N*/,
|
||||
int seq_len,
|
||||
float target) {
|
||||
for (int i = 0; i < seq_len; ++i) {
|
||||
for (const auto i : c10::irange(seq_len)) {
|
||||
// assume that the mask_seq is smaller than size
|
||||
// Although it seems that random access gets bad performance,
|
||||
// we make sure that seq is in order;
|
||||
|
@ -35,7 +35,7 @@ template <int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
for (const auto i : c10::irange(vs.size())) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
@ -71,8 +71,8 @@ void trans_mat<float, CPUContext>(
|
||||
int m,
|
||||
int n,
|
||||
CPUContext* /*context*/) {
|
||||
for (int i = 0; i < m; ++i) {
|
||||
for (int j = 0; j < n; ++j) {
|
||||
for (const auto i : c10::irange(m)) {
|
||||
for (const auto j : c10::irange(n)) {
|
||||
t[j * m + i] = o[i * n + j];
|
||||
}
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ class FunHashOp : public Operator<Context> {
|
||||
|
||||
int64_t n_segments = num_segments_;
|
||||
if (num_segments_ == -1) {
|
||||
for (int64_t i = 0; i < num_nz_ent; ++i) {
|
||||
for (const auto i : c10::irange(num_nz_ent)) {
|
||||
if (seg_data[i] > n_segments) {
|
||||
n_segments = seg_data[i];
|
||||
}
|
||||
@ -86,14 +86,14 @@ class FunHashOp : public Operator<Context> {
|
||||
const auto* val_data = val.template data<T>();
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t output_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T sum = 0;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
uint64_t hash;
|
||||
// The hash function takes as input four integers:
|
||||
// 1. feature index
|
||||
@ -186,14 +186,14 @@ class FunHashGradientOp : public Operator<Context> {
|
||||
|
||||
memset(grad_weight_data, 0, sizeof(T) * num_weight);
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t grad_out_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
uint64_t hash;
|
||||
hash_data[0] = cur_key;
|
||||
hash_data[1] = i;
|
||||
|
@ -66,7 +66,7 @@ class SparseFunHashOp : public Operator<Context> {
|
||||
|
||||
int64_t n_segments = num_segments_;
|
||||
if (num_segments_ == -1) {
|
||||
for (int64_t i = 0; i < num_nz_ent; ++i) {
|
||||
for (const auto i : c10::irange(num_nz_ent)) {
|
||||
if (seg_data[i] > n_segments) {
|
||||
n_segments = seg_data[i];
|
||||
}
|
||||
@ -85,14 +85,14 @@ class SparseFunHashOp : public Operator<Context> {
|
||||
const auto* val_data = val.template data<T>();
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t output_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T sum = 0;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
// The hash function takes as input three integers:
|
||||
// 1. feature index
|
||||
// 2. output index
|
||||
@ -190,14 +190,14 @@ class SparseFunHashGradientOp : public Operator<Context> {
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
int64_t w_ind = 0;
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t grad_out_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
hash_data[0] = cur_key;
|
||||
hash_data[1] = i;
|
||||
hash_data[2] = k;
|
||||
|
@ -111,7 +111,7 @@ class SparseMatrixReshapeOp : public Operator<Context> {
auto* new_col_data = new_col->template mutable_data<int64_t>();
auto* new_row_data = new_row->template mutable_data<int>();

for (int i = 0; i < nnz; ++i) {
for (const auto i : c10::irange(nnz)) {
int64_t offset = old_row_data[i] * old_stride_ + old_col_data[i];
new_row_data[i] = offset / new_stride_;
new_col_data[i] = offset % new_stride_;

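Every hunk in this commit applies the same mechanical rewrite from an index-based for loop to c10::irange. As a minimal standalone sketch of that pattern, assuming only <c10/util/irange.h> from this repository (the function and variable names below are illustrative and do not come from any file in the diff):

#include <c10/util/irange.h>

#include <vector>

// Illustrative only: sums a vector using the loop style this commit adopts.
float sum(const std::vector<float>& data) {
  float total = 0.0f;
  // Before the rewrite this would be: for (size_t i = 0; i < data.size(); ++i)
  // c10::irange(n) yields 0, 1, ..., n - 1, and the index is const.
  for (const auto i : c10::irange(data.size())) {
    total += data[i];
  }
  return total;
}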