Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
use irange for loops 2 (#66746)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66746

Modified loops in files under fbsource/fbcode/caffe2/ from the format `for(TYPE var=x0;var<x_max;x++)` to the format `for(const auto var: irange(xmax))`.

This was achieved by running r-barnes's loop upgrader script (D28874212), with some modifications to exclude all files under /torch/jit, plus a number of hand-written reversions and unused-variable warning suppressions.

Test Plan: Sandcastle

Reviewed By: malfet

Differential Revision: D31705361

fbshipit-source-id: 33fd22eb03086d114e2c98e56703e8ec84460268
Committed by: Facebook GitHub Bot
Parent: 91d16cb633
Commit: 29d759948e
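The rewrite is mechanical, and the hunks below all follow the same pattern. As a quick orientation, here is a minimal sketch of the before/after loop forms; the function and variable names are hypothetical illustrations, and only `c10::irange` itself comes from this diff.

```cpp
#include <c10/util/irange.h>

#include <cstdint>
#include <vector>

// Illustrative helper, not part of the commit.
int64_t count_positive(const std::vector<float>& v) {
  int64_t count = 0;
  // Before: for (size_t i = 0; i < v.size(); ++i) { ... }
  // After: c10::irange(v.size()) yields 0, 1, ..., v.size()-1 as const values
  // of the bound's type, so the index cannot be mutated inside the body and
  // signed/unsigned mismatches in the loop condition go away.
  for (const auto i : c10::irange(v.size())) {
    if (v[i] > 0.0f) {
      ++count;
    }
  }
  return count;
}
```

The two-argument form `c10::irange(start, end)` covers half-open ranges in the same way; that is how the `at::parallel_for` bodies in several hunks below are converted.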
@@ -303,7 +303,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
  w_zero_points[0]);
  auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
  auto wt_numel = weight_contig.numel();
- for (int i = 0; i < wt_numel; ++i) {
+ for (const auto i : c10::irange(wt_numel)) {
  qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
  }
  // Original bias was float, so we requantize it here.
@@ -301,7 +301,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(
  auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
  int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
  auto wt_numel = weight_contig.numel();
- for (int i = 0; i < wt_numel; ++i) {
+ for (const auto i : c10::irange(wt_numel)) {
  qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
  }
@@ -9,6 +9,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -43,7 +44,7 @@ void spatial_dilated_max_pooling(
  int64_t dW, // dilation
  T* oData) { // output arrays (data and max-index)
  at::parallel_for(0, iC, 0, [&](int64_t start, int64_t end) {
- for (auto p = start; p < end; ++p) {
+ for (const auto p : c10::irange(start, end)) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t row, col;
  const T* i_p = iData + p * iW * iH;
@@ -195,7 +196,7 @@ Tensor q_maxpool_2d(
  oData);
  } else {
  at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
- for (auto p = start; p < end; ++p) {
+ for (const auto p : c10::irange(start, end)) {
  auto* iData = qxd + p * iC * iW * iH;
  auto* oData = qyd + p * oC * oW * oH;
  spatial_dilated_max_pooling<Q>(
@@ -6,6 +6,7 @@
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
  #include <ATen/native/quantized/cpu/quantized_ops.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
  #include <torch/library.h>
@@ -30,7 +31,7 @@ Tensor qnnpack_relu(Tensor input) {
  initQNNPACK();

  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
@@ -7,6 +7,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -26,7 +27,7 @@ Tensor qnnpack_sigmoid(

  Tensor input_contig = input.contiguous(input.suggest_memory_format());
  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
@@ -7,6 +7,7 @@
  #include <ATen/native/quantized/cpu/quantized_ops.h>
  #include <ATen/native/quantized/cpu/init_qnnpack.h>
  #include <ATen/native/quantized/cpu/qnnpack_utils.h>
+ #include <c10/util/irange.h>
  #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

  #include <algorithm>
@@ -29,7 +30,7 @@ Tensor qnnpack_tanh(Tensor input) {

  Tensor input_contig = input.contiguous(input.suggest_memory_format());
  size_t num_elems = 1;
- for (int i = 1; i < input_contig.ndimension(); ++i) {
+ for (const auto i : c10::irange(1, input_contig.ndimension())) {
  num_elems *= input_contig.size(i);
  }
  const auto zero_point = input_contig.q_zero_point();
@@ -1,6 +1,7 @@
  #pragma once

  #include <ATen/ATen.h>
+ #include <c10/util/irange.h>
  #include <algorithm>
  #include <cmath>
@@ -193,7 +194,7 @@ static C10_UNUSED torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_
  inline void HandleWeightsSaturation(int64_t N, float* weight) {
  const float kFp16Max = RawUint16ToFp16(0x7BFF);
  bool found_out_of_range = false;
- for (int64_t i = 0; i < N; ++i) {
+ for (const auto i : c10::irange(N)) {
  bool saturate = CheckAndSaturate<float>(kFp16Max, weight + i);
  if (saturate) {
  found_out_of_range = true;
@@ -2,6 +2,7 @@
  #include <ATen/native/UpSample.h>
  #include <ATen/native/quantized/affine_quantizer.h>
  #include <ATen/native/quantized/cpu/quantized_ops.h>
+ #include <c10/util/irange.h>

  #include <algorithm>
  #include <cmath>
@@ -57,7 +58,7 @@ static void upsample_bilinear2d_out_frame(
  const int64_t input_q_zero_point = input.q_zero_point();
  const int64_t output_q_zero_point = output.q_zero_point();

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const auto h1r = area_pixel_compute_source_index<float>(
  rheight, h2, align_corners, /*cubic=*/false);
@@ -67,7 +68,7 @@ static void upsample_bilinear2d_out_frame(
  const float h1lambda = h1r - h1;
  const float h0lambda = static_cast<float>(1.) - h1lambda;

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const auto w1r = area_pixel_compute_source_index<float>(
  rwidth, w2, align_corners, /*cubic=*/false);
@@ -79,7 +80,8 @@ static void upsample_bilinear2d_out_frame(
  const typename scalar_t::underlying* pos1 = i_p + h1 * input_width + w1;
  typename scalar_t::underlying* pos2 = o_p + h2 * output_width + w2;

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  float result = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
  h1lambda *
  (w0lambda * pos1[h1p * input_width] +
@@ -44,18 +44,19 @@ static void upsample_nearest2d_out_frame(
  return;
  }

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);

  const auto* pos1 = &i_p[h1 * input_width + w1];
  auto* pos2 = &o_p[h2 * output_width + w2];

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  pos2[0] = pos1[0];
  pos1 += input_height * input_width;
  pos2 += output_height * output_width;
@@ -88,11 +89,11 @@ static void upsample_nearest2d_out_frame_nhwc(
  return;
  }

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);
@@ -48,22 +48,23 @@ static void upsample_nearest3d_out_frame(
  return;
  }

- for (int64_t d2 = 0; d2 < output_depth; ++d2) {
+ for (const auto d2 : c10::irange(output_depth)) {
  const int64_t d1 =
  nn_compute_source_index_fn(depth_scale, d2, input_depth);

- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);

  const auto* pos1 = &i_p[d1 * input_height * input_width + h1 * input_width + w1];
  auto* pos2 = &o_p[d2 * output_height * output_width + h2 * output_width + w2];

- for (int64_t c = 0; c < channels; ++c) {
+ for (const auto c : c10::irange(channels)) {
+ (void)c; //Suppress unused variable warning
  pos2[0] = pos1[0];
  pos1 += input_depth * input_height * input_width;
  pos2 += output_depth * output_height * output_width;
@@ -101,14 +102,14 @@ static void upsample_nearest3d_out_frame_nhwc(
  return;
  }

- for (int64_t d2 = 0; d2 < output_depth; ++d2) {
+ for (const auto d2 : c10::irange(output_depth)) {
  const int64_t d1 =
  nn_compute_source_index_fn(depth_scale, d2, input_depth);
- for (int64_t h2 = 0; h2 < output_height; ++h2) {
+ for (const auto h2 : c10::irange(output_height)) {
  const int64_t h1 =
  nn_compute_source_index_fn(height_scale, h2, input_height);

- for (int64_t w2 = 0; w2 < output_width; ++w2) {
+ for (const auto w2 : c10::irange(output_width)) {
  const int64_t w1 =
  nn_compute_source_index_fn(width_scale, w2, input_width);
@@ -218,7 +218,7 @@ std::tuple<Tensor, Tensor, Tensor> _fake_quantize_learnable_per_channel_affine_b
  // into the same shapes as X along the channel axis.
  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  int64_t* axis_mask = (int64_t *) calloc(numDimensions, sizeof(int64_t));
- for (int i = 0; i < numDimensions; ++i) {
+ for (const auto i : c10::irange(numDimensions)) {
  axis_mask[i] = (i == axis) ? X.size(axis) : 1;
  }
  auto X_shape = X.sizes();
@ -7,6 +7,7 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
@ -71,9 +72,9 @@ std::vector<int64_t> get_offsets(const Tensor& indices, const IntArrayRef& sizes
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i=0; i < nnz; i++) {
|
||||
for (const auto i : c10::irange(nnz)) {
|
||||
int64_t acc = 0;
|
||||
for (int64_t j=0; j < ndim; j++) {
|
||||
for (const auto j : c10::irange(ndim)) {
|
||||
auto indices_row = indices_accessor[j];
|
||||
auto stride = strides[j];
|
||||
if (j != dim) {
|
||||
@ -119,9 +120,9 @@ std::vector<std::vector<int64_t>> get_pools(const Tensor& indices, const IntArra
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i=0; i < nnz; i++) {
|
||||
for (const auto i : c10::irange(nnz)) {
|
||||
int64_t pool_index = 0;
|
||||
for (int64_t j=0; j < ndim; j++) {
|
||||
for (const auto j : c10::irange(ndim)) {
|
||||
if (j != dim) {
|
||||
const auto indices_row = indices_accessor[j];
|
||||
const auto stride = strides[j];
|
||||
@ -315,7 +316,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
|
||||
int64_t grain_size = 1;
|
||||
parallel_for(0, pools.size(), grain_size, [&](int64_t begin, int64_t end) {
|
||||
for (auto p = begin; p < end; p++) {
|
||||
for (const auto p : c10::irange(begin, end)) {
|
||||
auto pool_indices = pools[p];
|
||||
|
||||
// Skip empty pools
|
||||
@ -329,7 +330,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
/* Compute mx */
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
mx_row[j] = std::max(mx_row[j], values_row[j]);
|
||||
}
|
||||
}
|
||||
@ -338,7 +339,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
auto out_values_row = out_values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
auto v = std::exp(values_row[j] - mx_row[j]);
|
||||
if (!LogSoftMax) {
|
||||
out_values_row[j] = v;
|
||||
@ -347,7 +348,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
mx_row[j] += std::log(exp_sums_row[j]);
|
||||
} else {
|
||||
@ -359,7 +360,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
auto out_values_row = out_values_accessor[i];
|
||||
for (int64_t j=0; j < nvalues; j++) {
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
out_values_row[j] = values_row[j] - mx_row[j];
|
||||
} else {
|
||||
@ -421,7 +422,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
values.set_(r);
|
||||
}
|
||||
} else {
|
||||
for(int64_t i=0; i<out_nnz; i++) {
|
||||
for (const auto i : c10::irange(out_nnz)) {
|
||||
auto low = std::lower_bound(grad_offsets.begin(), grad_offsets.end(), out_offsets[i]);
|
||||
auto j = low - grad_offsets.begin();
|
||||
if (j < grad_nnz && out_offsets[i] == grad_offsets[j]) {
|
||||
@ -456,7 +457,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
int64_t grain_size = 1;
|
||||
parallel_for(0, pools.size(), grain_size, [&](int64_t begin, int64_t end) {
|
||||
for (auto p = begin; p < end; p++) {
|
||||
for (const auto p : c10::irange(begin, end)) {
|
||||
auto pool_indices = pools[p];
|
||||
|
||||
// Skip empty pools
|
||||
@ -473,7 +474,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) {
|
||||
auto grad_values_row = grad_values_accessor[j];
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
tmp_row[k] -= grad_values_row[k];
|
||||
} else {
|
||||
@ -492,7 +493,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
|
||||
if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) {
|
||||
auto grad_values_row = grad_values_accessor[j];
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
values_row[k] = grad_values_row[k] + std::exp(out_values_row[k]) * tmp_row[k];
|
||||
} else {
|
||||
@ -500,7 +501,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int64_t k=0; k<nvalues; k++) {
|
||||
for (const auto k : c10::irange(nvalues)) {
|
||||
if (LogSoftMax) {
|
||||
values_row[k] = std::exp(out_values_row[k]) * tmp_row[k];
|
||||
} else {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkl/SparseBlasImpl.h>
|
||||
#include <ATen/native/sparse/SparseBlasImpl.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
@ -60,7 +61,7 @@ void convert_indices_from_coo_to_csr_cpu(const Tensor& result, const Tensor& inp
|
||||
|
||||
at::parallel_for(0, numel - 1, GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||
input_t curr_value = data_in[start], next_value;
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
next_value = data_in[i + 1];
|
||||
for (; curr_value < next_value; curr_value++)
|
||||
data_out[curr_value + 1] = static_cast<output_t>(i + 1);
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <ATen/SparseTensorImpl.h>
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace at { namespace native {
|
||||
@ -30,7 +31,7 @@ void csr_to_coo(const int64_t n_row, const int64_t Ap[], int64_t Bi[]) {
|
||||
Output:
|
||||
`Bi` is the row indices
|
||||
*/
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) {
|
||||
Bi[jj] = i;
|
||||
}
|
||||
@ -56,7 +57,7 @@ int64_t _csr_matmult_maxnnz(
|
||||
*/
|
||||
std::vector<int64_t> mask(n_col, -1);
|
||||
int64_t nnz = 0;
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
int64_t row_nnz = 0;
|
||||
|
||||
for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) {
|
||||
@ -127,19 +128,19 @@ void _csr_matmult(
|
||||
|
||||
Cp[0] = 0;
|
||||
|
||||
for (int64_t i = 0; i < n_row; i++) {
|
||||
for (const auto i : c10::irange(n_row)) {
|
||||
int64_t head = -2;
|
||||
int64_t length = 0;
|
||||
|
||||
int64_t jj_start = Ap[i];
|
||||
int64_t jj_end = Ap[i + 1];
|
||||
for (int64_t jj = jj_start; jj < jj_end; jj++) {
|
||||
for (const auto jj : c10::irange(jj_start, jj_end)) {
|
||||
int64_t j = Aj[jj];
|
||||
scalar_t v = Ax[jj];
|
||||
|
||||
int64_t kk_start = Bp[j];
|
||||
int64_t kk_end = Bp[j + 1];
|
||||
for (int64_t kk = kk_start; kk < kk_end; kk++) {
|
||||
for (const auto kk : c10::irange(kk_start, kk_end)) {
|
||||
int64_t k = Bj[kk];
|
||||
|
||||
sums[k] += v * Bx[kk];
|
||||
@ -152,7 +153,8 @@ void _csr_matmult(
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t jj = 0; jj < length; jj++) {
|
||||
for (const auto jj : c10::irange(length)) {
|
||||
(void)jj; //Suppress unused variable warning
|
||||
Cj[nnz] = head;
|
||||
Cx[nnz] = sums[head];
|
||||
nnz++;
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include <ATen/native/Copy.h>
|
||||
#include <ATen/native/CPUBlas.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -229,7 +230,7 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_,
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
auto cpu_computed_indices_sizes_accessor =
|
||||
cpu_computed_indices_sizes.accessor<int64_t, 1>();
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
TORCH_CHECK(
|
||||
min_index_in_dim >= 0,
|
||||
@ -244,11 +245,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_,
|
||||
// If the indices doesn't have elements in it, there is not enough
|
||||
// information to know what the minimum sparse dimension sizes should be,
|
||||
// and in this case we set them to 0
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
computed_sizes[static_cast<size_t>(d)] = 0;
|
||||
}
|
||||
}
|
||||
for (int64_t d = 0; d < dense_dim; d++) {
|
||||
for (const auto d : c10::irange(dense_dim)) {
|
||||
computed_sizes[static_cast<size_t>(sparse_dim + d)] = values.size(d + 1);
|
||||
}
|
||||
|
||||
@ -305,7 +306,7 @@ void _validate_sparse_coo_tensor_args(
|
||||
}
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
auto cpu_max_indices_accessor = cpu_max_indices.accessor<int64_t, 1>();
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
// NB: This used to sync ndim times to access each entry; now we copy
|
||||
// everything to CPU first and then access it.
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
@ -597,7 +598,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) {
|
||||
int64_t blockSize = values.stride(0);
|
||||
scalar_t* values_ptr = values.data_ptr<scalar_t>();
|
||||
scalar_t* newValues_ptr = newValues.data_ptr<scalar_t>();
|
||||
for (int64_t j = 0; j < nnz; j++) {
|
||||
for (const auto j : c10::irange(nnz)) {
|
||||
int64_t pos = indicesPermutationAccessor[j];
|
||||
int64_t curr = indicesBufferAccessor[j];
|
||||
if (curr == prev) {
|
||||
@ -613,7 +614,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) {
|
||||
}
|
||||
} else {
|
||||
++i;
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
newIndicesAccessor[d][i] = indicesAccessor[d][pos];
|
||||
}
|
||||
if (values.numel() >
|
||||
@ -656,9 +657,9 @@ void inline sparse_mask_out_cpu_kernel(
|
||||
auto t_strides = t.strides();
|
||||
|
||||
at::parallel_for(0, r_nnz, 1000, [&](int64_t start, int64_t end) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
int64_t idx = 0;
|
||||
for (int64_t d = 0; d < sparse_dim; d++) {
|
||||
for (const auto d : c10::irange(sparse_dim)) {
|
||||
idx += mask_indices_accessor[d][i] * t_strides[d];
|
||||
}
|
||||
r_values_accessor[i] = t_ptr[idx];
|
||||
@ -706,14 +707,14 @@ SparseTensor& sparse_mask_out_cpu(
|
||||
// ]. Keeping this implementation because it is faster than
|
||||
// flatten_indices()
|
||||
Tensor indices = at::zeros({mask._nnz()}, mask_indices.options());
|
||||
for (int64_t d = 0; d < mask.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.sparse_dim())) {
|
||||
indices.mul_(mask.size(d));
|
||||
indices.add_(mask_indices.select(0, d));
|
||||
}
|
||||
|
||||
std::vector<int64_t> view_size(1 + mask.dense_dim());
|
||||
view_size[0] = -1;
|
||||
for (int64_t d = 0; d < mask.dense_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.dense_dim())) {
|
||||
view_size[d + 1] = mask.size(mask.sparse_dim() + d);
|
||||
}
|
||||
|
||||
@ -777,7 +778,7 @@ Tensor sparse_mask_helper_cpu(
|
||||
|
||||
// Step 1: flatten the sparse indices `t._indices()` tensor and then map this
|
||||
// flatten value `index` to the original position `i`
|
||||
for (int64_t i = 0; i < t_nnz; i++) {
|
||||
for (const auto i : c10::irange(t_nnz)) {
|
||||
int64_t index = ti_flattened_indices.data_ptr<int64_t>()[i];
|
||||
t_flatten_indices[index] = i;
|
||||
}
|
||||
@ -802,7 +803,7 @@ Tensor sparse_mask_helper_cpu(
|
||||
const auto r_values_stride = r_values.strides()[0] * r_values.element_size();
|
||||
const auto t_values_stride = t_v.strides()[0] * t_v.element_size();
|
||||
|
||||
for (auto i = start; i < end; i++) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
int64_t index = flattened_mask_indices.data_ptr<int64_t>()[i];
|
||||
auto iter = t_flatten_indices.find(index);
|
||||
if (iter != t_flatten_indices.end()) {
|
||||
|
@ -601,9 +601,9 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen
|
||||
// accessors rely on nnz test
|
||||
if (nDim > nDimI) {
|
||||
auto indices_accessor = indices.accessor<int64_t, 2>();
|
||||
for (int64_t k = 0; k < sparse._nnz(); k++) {
|
||||
for (const auto k : c10::irange(sparse._nnz())) {
|
||||
Tensor dstBuffer = resultBuffer;
|
||||
for (int64_t d = 0; d < sparse.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(sparse.sparse_dim())) {
|
||||
dstBuffer = dstBuffer.select(0, indices_accessor[d][k]);
|
||||
}
|
||||
Tensor srcBuffer = valuesBuffer.select(0, k);
|
||||
@ -970,7 +970,7 @@ SparseTensor& hspmm_out_sparse_cpu(const SparseTensor& sparse_, const Tensor& de
|
||||
auto indices_accessor = indices.accessor<int64_t, 2>();
|
||||
|
||||
int64_t i = -1, prevIdx = -1;
|
||||
for (int64_t j = 0; j < nnz; j++) {
|
||||
for (const auto j : c10::irange(nnz)) {
|
||||
int64_t currIdx = valueIndices_accessor[j];
|
||||
if (currIdx != prevIdx) {
|
||||
indices_accessor[0][++i] = currIdx;
|
||||
@ -1086,10 +1086,10 @@ SparseTensor& _sspaddmm_out_cpu(
|
||||
scalar_t* newv_ptr = newv.data_ptr<scalar_t>();
|
||||
scalar_t cast_alpha = alpha.to<scalar_t>();
|
||||
|
||||
for (int64_t h = 0; h < dim_i; h++) {
|
||||
for (const auto h : c10::irange(dim_i)) {
|
||||
int64_t i_start = csr_accessor[h];
|
||||
int64_t i_end = csr_accessor[h+1];
|
||||
for (int64_t i = i_start; i < i_end; i++) {
|
||||
for (const auto i : c10::irange(i_start, i_end)) {
|
||||
scalar_t val = values_accessor[i];
|
||||
int64_t col = indices_accessor[1][i];
|
||||
if (col >= 0 && col < dim_j) {
|
||||
@ -1103,7 +1103,7 @@ SparseTensor& _sspaddmm_out_cpu(
|
||||
}
|
||||
// Fill up the indices with the right values
|
||||
if (i_start != i_end) {
|
||||
for (int64_t i = 0; i < dim_k; i++) {
|
||||
for (const auto i : c10::irange(dim_k)) {
|
||||
newi_accessor[0][p+i] = h;
|
||||
newi_accessor[1][p+i] = i;
|
||||
}
|
||||
@ -1178,7 +1178,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) {
|
||||
|
||||
auto dims_to_keep_v = std::vector<int64_t>();
|
||||
auto dense_dims_to_sum_v = std::vector<int64_t>();
|
||||
for (int64_t d = 0; d < input_dim; d++) {
|
||||
for (const auto d : c10::irange(input_dim)) {
|
||||
if (dims_to_sum_b[d]) {
|
||||
if (d >= sparse_dim) dense_dims_to_sum_v.emplace_back(d + 1 - sparse_dim);
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/SparseTensorUtils.h>
|
||||
#include <ATen/cuda/CUDAUtils.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -34,7 +35,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars
|
||||
// Get a flattened sparse indices, similar to NOTE [ Flatten Sparse Indices ].
|
||||
// Keeping this implementation because it is faster than flatten_indices()
|
||||
Tensor indices = at::zeros({mask._nnz()}, mask_indices.options());
|
||||
for (int64_t d = 0; d < mask.sparse_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.sparse_dim())) {
|
||||
indices.mul_(mask.size(d));
|
||||
// This used to use a buffer but I deoptimized it
|
||||
indices.add_(mask_indices.select(0, d));
|
||||
@ -42,7 +43,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars
|
||||
|
||||
std::vector<int64_t> view_size(1 + mask.dense_dim());
|
||||
view_size[0] = -1;
|
||||
for (int64_t d = 0; d < mask.dense_dim(); d++) {
|
||||
for (const auto d : c10::irange(mask.dense_dim())) {
|
||||
view_size[d + 1] = mask.size(mask.sparse_dim() + d);
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@ struct ParamsHash {
|
||||
size_t operator()(const Params& params) const {
|
||||
auto ptr = reinterpret_cast<const uint8_t*>(¶ms);
|
||||
uint32_t value = 0x811C9DC5;
|
||||
for (int i = 0; i < (int)sizeof(Params); ++i) {
|
||||
for (const auto i : c10::irange((int)sizeof(Params))) {
|
||||
value ^= ptr[i];
|
||||
value *= 0x01000193;
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#ifdef USE_VULKAN_WRAPPER
|
||||
#include <vulkan_wrapper.h>
|
||||
@ -192,7 +193,7 @@ uint32_t VContext::getComputeQueueFamilyIndex() {
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(
|
||||
physicalDevice_, &queueFamilyCount, queueFamilies.data());
|
||||
|
||||
for (uint32_t i = 0; i < queueFamilies.size(); ++i) {
|
||||
for (const auto i : c10::irange(queueFamilies.size())) {
|
||||
VkQueueFamilyProperties props = queueFamilies[i];
|
||||
if (props.queueCount > 0 && (props.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
|
||||
return i;
|
||||
@ -274,7 +275,7 @@ uint32_t findMemoryType(
|
||||
const VkMemoryPropertyFlags properties) {
|
||||
VkPhysicalDeviceMemoryProperties memoryProperties{};
|
||||
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memoryProperties);
|
||||
for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; ++i) {
|
||||
for (const auto i : c10::irange(memoryProperties.memoryTypeCount)) {
|
||||
if ((memoryTypeBits & (1 << i)) &&
|
||||
((memoryProperties.memoryTypes[i].propertyFlags & properties) ==
|
||||
properties)) {
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <ATen/native/vulkan/VulkanOpaqueTensorImpl.h>
|
||||
#include <ATen/native/vulkan/VulkanOps.h>
|
||||
#include <ATen/vulkan/Context.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -265,13 +266,13 @@ Tensor cat(const TensorList tensors, int64_t dim) {
|
||||
int64_t cat_dim_size = 0;
|
||||
|
||||
std::vector<VulkanTensor> vTensors{};
|
||||
for (int i = 0; i < tensors.size(); ++i) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
const auto& t = tensors[i];
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
t.dim() == 4, "Vulkan cat expects 4 dimensional inputs");
|
||||
TORCH_INTERNAL_ASSERT(t.is_vulkan(), "Vulkan cat expects Vulkan inputs");
|
||||
|
||||
for (int d = 0; d < 4; ++d) {
|
||||
for (const auto d : c10::irange(4)) {
|
||||
if (d == dim) {
|
||||
continue;
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <ATen/native/vulkan/Vulkan.h>
|
||||
#include <ATen/native/vulkan/VulkanCommon.h>
|
||||
@ -629,17 +630,17 @@ VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4(
|
||||
memset(basePtr, 0, size);
|
||||
const float* src = weights;
|
||||
int ridx = 0;
|
||||
for (int oc = 0; oc < OC; ++oc) {
|
||||
for (const auto oc : c10::irange(OC)) {
|
||||
int oc_4 = oc / 4;
|
||||
int oc_4_i = oc % 4;
|
||||
float* dst_oc = basePtr + oc_4 * oc_4SizeNumel;
|
||||
for (int ic = 0; ic < C; ++ic) {
|
||||
for (const auto ic : c10::irange(C)) {
|
||||
int ic_4 = ic / 4;
|
||||
int ic_4_i = ic % 4;
|
||||
float* dst_ic = dst_oc + ic_4 * KW * KH * 16;
|
||||
for (int ky = 0; ky < KH; ++ky) {
|
||||
for (const auto ky : c10::irange(KH)) {
|
||||
float* dst_ky = dst_ic + ky * KW * 16;
|
||||
for (int kx = 0; kx < KW; ++kx) {
|
||||
for (const auto kx : c10::irange(KW)) {
|
||||
float* dst_kx = dst_ky + kx * 16;
|
||||
dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++];
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <ATen/native/vulkan/api/Runtime.h>
|
||||
#include <ATen/native/vulkan/api/Adapter.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
@ -244,7 +245,7 @@ uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device
|
||||
&queue_family_count,
|
||||
queue_families_properties.data());
|
||||
|
||||
for (uint32_t i = 0; i < queue_families_properties.size(); ++i) {
|
||||
for (const auto i : c10::irange(queue_families_properties.size())) {
|
||||
const VkQueueFamilyProperties& properties = queue_families_properties[i];
|
||||
if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
|
||||
return i;
|
||||
|
@ -1005,8 +1005,7 @@ VmaDefragmentationContext defragCtx;
|
||||
vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx);
|
||||
vmaDefragmentationEnd(allocator, defragCtx);
|
||||
|
||||
for(uint32_t i = 0; i < allocCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocCount)) {
|
||||
if(allocationsChanged[i])
|
||||
{
|
||||
// Destroy buffer that is immutably bound to memory region which is no longer valid.
|
||||
@ -1083,8 +1082,7 @@ vkEndCommandBuffer(commandBuffer);
|
||||
|
||||
vmaDefragmentationEnd(allocator, defragCtx);
|
||||
|
||||
for(uint32_t i = 0; i < allocCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocCount)) {
|
||||
if(allocationsChanged[i])
|
||||
{
|
||||
// Destroy buffer that is immutably bound to memory region which is no longer valid.
|
||||
@ -4818,8 +4816,7 @@ T must be pointer type, e.g. VmaAllocation, VmaPool.
|
||||
template<typename T>
|
||||
static bool VmaValidatePointerArray(uint32_t count, const T* arr)
|
||||
{
|
||||
for(uint32_t i = 0; i < count; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(count)) {
|
||||
const T iPtr = arr[i];
|
||||
if(iPtr == VMA_NULL)
|
||||
{
|
||||
@ -7459,8 +7456,7 @@ private:
|
||||
{
|
||||
FreeSpace s = {};
|
||||
s.blockInfoIndex = SIZE_MAX;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
m_FreeSpaces[i] = s;
|
||||
}
|
||||
}
|
||||
@ -7474,8 +7470,7 @@ private:
|
||||
|
||||
// Find first invalid or the smallest structure.
|
||||
size_t bestIndex = SIZE_MAX;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
// Empty structure.
|
||||
if(m_FreeSpaces[i].blockInfoIndex == SIZE_MAX)
|
||||
{
|
||||
@ -7502,8 +7497,7 @@ private:
|
||||
{
|
||||
size_t bestIndex = SIZE_MAX;
|
||||
VkDeviceSize bestFreeSpaceAfter = 0;
|
||||
for(size_t i = 0; i < MAX_COUNT; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(MAX_COUNT)) {
|
||||
// Structure is valid.
|
||||
if(m_FreeSpaces[i].blockInfoIndex != SIZE_MAX)
|
||||
{
|
||||
@ -7846,8 +7840,7 @@ struct VmaCurrentBudgetData
|
||||
|
||||
VmaCurrentBudgetData()
|
||||
{
|
||||
for(uint32_t heapIndex = 0; heapIndex < VK_MAX_MEMORY_HEAPS; ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(VK_MAX_MEMORY_HEAPS)) {
|
||||
m_BlockBytes[heapIndex] = 0;
|
||||
m_AllocationBytes[heapIndex] = 0;
|
||||
#if VMA_MEMORY_BUDGET
|
||||
@ -8447,8 +8440,7 @@ void VmaJsonWriter::ContinueString(const char* pStr)
|
||||
VMA_ASSERT(m_InsideString);
|
||||
|
||||
const size_t strLen = strlen(pStr);
|
||||
for(size_t i = 0; i < strLen; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(strLen)) {
|
||||
char ch = pStr[i];
|
||||
if(ch == '\\')
|
||||
{
|
||||
@ -8583,8 +8575,7 @@ void VmaJsonWriter::WriteIndent(bool oneLess)
|
||||
{
|
||||
--count;
|
||||
}
|
||||
for(size_t i = 0; i < count; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(count)) {
|
||||
m_SB.Add(INDENT);
|
||||
}
|
||||
}
|
||||
@ -9123,8 +9114,7 @@ bool VmaBlockMetadata_Generic::Validate() const
|
||||
VMA_VALIDATE(m_FreeSuballocationsBySize.size() == freeSuballocationsToRegister);
|
||||
|
||||
VkDeviceSize lastSize = 0;
|
||||
for(size_t i = 0; i < m_FreeSuballocationsBySize.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_FreeSuballocationsBySize.size())) {
|
||||
VmaSuballocationList::iterator suballocItem = m_FreeSuballocationsBySize[i];
|
||||
|
||||
// Only free suballocations can be registered in m_FreeSuballocationsBySize.
|
||||
@ -10075,8 +10065,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
{
|
||||
const size_t suballoc2ndCount = suballocations2nd.size();
|
||||
size_t nullItem2ndCount = 0;
|
||||
for(size_t i = 0; i < suballoc2ndCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(suballoc2ndCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations2nd[i];
|
||||
const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE);
|
||||
|
||||
@ -10100,8 +10089,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount);
|
||||
}
|
||||
|
||||
for(size_t i = 0; i < m_1stNullItemsBeginCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_1stNullItemsBeginCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations1st[i];
|
||||
VMA_VALIDATE(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE &&
|
||||
suballoc.hAllocation == VK_NULL_HANDLE);
|
||||
@ -10109,8 +10097,7 @@ bool VmaBlockMetadata_Linear::Validate() const
|
||||
|
||||
size_t nullItem1stCount = m_1stNullItemsBeginCount;
|
||||
|
||||
for(size_t i = m_1stNullItemsBeginCount; i < suballoc1stCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_1stNullItemsBeginCount, suballoc1stCount)) {
|
||||
const VmaSuballocation& suballoc = suballocations1st[i];
|
||||
const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE);
|
||||
|
||||
@ -11301,10 +11288,7 @@ bool VmaBlockMetadata_Linear::CreateAllocationRequest_LowerAddress(
|
||||
// If conflict exists, allocation cannot be made here.
|
||||
if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity)
|
||||
{
|
||||
for(size_t nextSuballocIndex = index1st;
|
||||
nextSuballocIndex < suballocations1st.size();
|
||||
nextSuballocIndex++)
|
||||
{
|
||||
for (const auto nextSuballocIndex : c10::irange(index1st, suballocations1st.size())) {
|
||||
const VmaSuballocation& nextSuballoc = suballocations1st[nextSuballocIndex];
|
||||
if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity))
|
||||
{
|
||||
@ -11712,8 +11696,7 @@ void VmaBlockMetadata_Linear::CleanupAfterFree()
|
||||
{
|
||||
const size_t nonNullItemCount = suballoc1stCount - nullItem1stCount;
|
||||
size_t srcIndex = m_1stNullItemsBeginCount;
|
||||
for(size_t dstIndex = 0; dstIndex < nonNullItemCount; ++dstIndex)
|
||||
{
|
||||
for (const auto dstIndex : c10::irange(nonNullItemCount)) {
|
||||
while(suballocations1st[srcIndex].hAllocation == VK_NULL_HANDLE)
|
||||
{
|
||||
++srcIndex;
|
||||
@ -11817,8 +11800,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
VMA_VALIDATE(m_SumFreeSize == ctx.calculatedSumFreeSize);
|
||||
|
||||
// Validate free node lists.
|
||||
for(uint32_t level = 0; level < m_LevelCount; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount)) {
|
||||
VMA_VALIDATE(m_FreeList[level].front == VMA_NULL ||
|
||||
m_FreeList[level].front->free.prev == VMA_NULL);
|
||||
|
||||
@ -11840,8 +11822,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
}
|
||||
|
||||
// Validate that free lists ar higher levels are empty.
|
||||
for(uint32_t level = m_LevelCount; level < MAX_LEVELS; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount, MAX_LEVELS)) {
|
||||
VMA_VALIDATE(m_FreeList[level].front == VMA_NULL && m_FreeList[level].back == VMA_NULL);
|
||||
}
|
||||
|
||||
@ -11850,8 +11831,7 @@ bool VmaBlockMetadata_Buddy::Validate() const
|
||||
|
||||
VkDeviceSize VmaBlockMetadata_Buddy::GetUnusedRangeSizeMax() const
|
||||
{
|
||||
for(uint32_t level = 0; level < m_LevelCount; ++level)
|
||||
{
|
||||
for (const auto level : c10::irange(m_LevelCount)) {
|
||||
if(m_FreeList[level].front != VMA_NULL)
|
||||
{
|
||||
return LevelToNodeSize(level);
|
||||
@ -12668,8 +12648,7 @@ VmaBlockVector::~VmaBlockVector()
|
||||
|
||||
VkResult VmaBlockVector::CreateMinBlocks()
|
||||
{
|
||||
for(size_t i = 0; i < m_MinBlockCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_MinBlockCount)) {
|
||||
VkResult res = CreateBlock(m_PreferredBlockSize, VMA_NULL);
|
||||
if(res != VK_SUCCESS)
|
||||
{
|
||||
@ -12692,8 +12671,7 @@ void VmaBlockVector::GetPoolStats(VmaPoolStats* pStats)
|
||||
pStats->unusedRangeSizeMax = 0;
|
||||
pStats->blockCount = blockCount;
|
||||
|
||||
for(uint32_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VMA_HEAVY_ASSERT(pBlock->Validate());
|
||||
@ -12873,8 +12851,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT)
|
||||
{
|
||||
// Forward order in m_Blocks - prefer blocks with smallest amount of free space.
|
||||
for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex )
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pCurrBlock);
|
||||
VkResult res = AllocateFromBlock(
|
||||
@ -12932,8 +12909,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
{
|
||||
// Allocate 1/8, 1/4, 1/2 as first blocks.
|
||||
const VkDeviceSize maxExistingBlockSize = CalcMaxBlockSize();
|
||||
for(uint32_t i = 0; i < NEW_BLOCK_SIZE_SHIFT_MAX; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(NEW_BLOCK_SIZE_SHIFT_MAX)) {
|
||||
const VkDeviceSize smallerNewBlockSize = newBlockSize / 2;
|
||||
if(smallerNewBlockSize > maxExistingBlockSize && smallerNewBlockSize >= size * 2)
|
||||
{
|
||||
@ -13013,8 +12989,7 @@ VkResult VmaBlockVector::AllocatePage(
|
||||
if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT)
|
||||
{
|
||||
// Forward order in m_Blocks - prefer blocks with smallest amount of free space.
|
||||
for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex )
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pCurrBlock);
|
||||
VmaAllocationRequest currRequest = {};
|
||||
@ -13238,8 +13213,7 @@ VkDeviceSize VmaBlockVector::CalcMaxBlockSize() const
|
||||
|
||||
void VmaBlockVector::Remove(VmaDeviceMemoryBlock* pBlock)
|
||||
{
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
if(m_Blocks[blockIndex] == pBlock)
|
||||
{
|
||||
VmaVectorRemove(m_Blocks, blockIndex);
|
||||
@ -13254,8 +13228,7 @@ void VmaBlockVector::IncrementallySortBlocks()
|
||||
if(m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT)
|
||||
{
|
||||
// Bubble sort only until first swap.
|
||||
for(size_t i = 1; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(1, m_Blocks.size())) {
|
||||
if(m_Blocks[i - 1]->m_pMetadata->GetSumFreeSize() > m_Blocks[i]->m_pMetadata->GetSumFreeSize())
|
||||
{
|
||||
VMA_SWAP(m_Blocks[i - 1], m_Blocks[i]);
|
||||
@ -13413,8 +13386,7 @@ void VmaBlockVector::ApplyDefragmentationMovesCpu(
|
||||
|
||||
// Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED.
|
||||
const size_t moveCount = moves.size();
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
blockInfo[move.srcBlockIndex].flags |= BLOCK_FLAG_USED;
|
||||
blockInfo[move.dstBlockIndex].flags |= BLOCK_FLAG_USED;
|
||||
@ -13448,8 +13420,7 @@ void VmaBlockVector::ApplyDefragmentationMovesCpu(
|
||||
const VkDeviceSize nonCoherentAtomSize = m_hAllocator->m_PhysicalDeviceProperties.limits.nonCoherentAtomSize;
|
||||
VkMappedMemoryRange memRange = { VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
|
||||
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
const BlockInfo& srcBlockInfo = blockInfo[move.srcBlockIndex];
|
||||
@ -13520,8 +13491,7 @@ void VmaBlockVector::ApplyDefragmentationMovesGpu(
|
||||
|
||||
// Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED.
|
||||
const size_t moveCount = moves.size();
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
//if(move.type == VMA_ALLOCATION_TYPE_UNKNOWN)
|
||||
@ -13560,8 +13530,7 @@ void VmaBlockVector::ApplyDefragmentationMovesGpu(
|
||||
// Go over all moves. Post data transfer commands to command buffer.
|
||||
if(pDefragCtx->res == VK_SUCCESS)
|
||||
{
|
||||
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
|
||||
{
|
||||
for (const auto moveIndex : c10::irange(moveCount)) {
|
||||
const VmaDefragmentationMove& move = moves[moveIndex];
|
||||
|
||||
const VmaBlockDefragmentationContext& srcBlockCtx = pDefragCtx->blockContexts[move.srcBlockIndex];
|
||||
@ -13686,8 +13655,7 @@ void VmaBlockVector::PrintDetailedMap(class VmaJsonWriter& json)
|
||||
|
||||
json.WriteString("Blocks");
|
||||
json.BeginObject();
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
json.BeginString();
|
||||
json.ContinueString(m_Blocks[i]->GetId());
|
||||
json.EndString();
|
||||
@ -13895,8 +13863,7 @@ void VmaBlockVector::CommitDefragmentations(
|
||||
size_t VmaBlockVector::CalcAllocationCount() const
|
||||
{
|
||||
size_t result = 0;
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
result += m_Blocks[i]->m_pMetadata->GetAllocationCount();
|
||||
}
|
||||
return result;
|
||||
@ -13928,8 +13895,7 @@ void VmaBlockVector::MakePoolAllocationsLost(
|
||||
{
|
||||
VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
size_t lostAllocationCount = 0;
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
lostAllocationCount += pBlock->m_pMetadata->MakeAllocationsLost(currentFrameIndex, m_FrameInUseCount);
|
||||
@ -13948,8 +13914,7 @@ VkResult VmaBlockVector::CheckCorruption()
|
||||
}
|
||||
|
||||
VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VkResult res = pBlock->CheckCorruption(m_hAllocator);
|
||||
@ -13968,8 +13933,7 @@ void VmaBlockVector::AddStats(VmaStats* pStats)
|
||||
|
||||
VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex);
|
||||
|
||||
for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(m_Blocks.size())) {
|
||||
const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex];
|
||||
VMA_ASSERT(pBlock);
|
||||
VMA_HEAVY_ASSERT(pBlock->Validate());
|
||||
@ -13998,8 +13962,7 @@ VmaDefragmentationAlgorithm_Generic::VmaDefragmentationAlgorithm_Generic(
|
||||
{
|
||||
// Create block info for each block.
|
||||
const size_t blockCount = m_pBlockVector->m_Blocks.size();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
BlockInfo* pBlockInfo = vma_new(m_hAllocator, BlockInfo)(m_hAllocator->GetAllocationCallbacks());
|
||||
pBlockInfo->m_OriginalBlockIndex = blockIndex;
|
||||
pBlockInfo->m_pBlock = m_pBlockVector->m_Blocks[blockIndex];
|
||||
@ -14197,8 +14160,7 @@ VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound(
|
||||
size_t VmaDefragmentationAlgorithm_Generic::CalcBlocksWithNonMovableCount() const
|
||||
{
|
||||
size_t result = 0;
|
||||
for(size_t i = 0; i < m_Blocks.size(); ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(m_Blocks.size())) {
|
||||
if(m_Blocks[i]->m_HasNonMovableAllocations)
|
||||
{
|
||||
++result;
|
||||
@ -14219,8 +14181,7 @@ VkResult VmaDefragmentationAlgorithm_Generic::Defragment(
|
||||
}
|
||||
|
||||
const size_t blockCount = m_Blocks.size();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
BlockInfo* pBlockInfo = m_Blocks[blockIndex];
|
||||
|
||||
if(m_AllAllocations)
|
||||
@ -14325,8 +14286,7 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
|
||||
// Sort blocks in order from most destination.
|
||||
|
||||
m_BlockInfos.resize(blockCount);
|
||||
for(size_t i = 0; i < blockCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(blockCount)) {
|
||||
m_BlockInfos[i].origBlockIndex = i;
|
||||
}
|
||||
|
||||
@ -14539,8 +14499,7 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
|
||||
void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata()
|
||||
{
|
||||
const size_t blockCount = m_pBlockVector->GetBlockCount();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
VmaBlockMetadata_Generic* const pMetadata =
|
||||
(VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata;
|
||||
pMetadata->m_FreeCount = 0;
|
||||
@ -14567,8 +14526,7 @@ void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata()
|
||||
void VmaDefragmentationAlgorithm_Fast::PostprocessMetadata()
|
||||
{
|
||||
const size_t blockCount = m_pBlockVector->GetBlockCount();
|
||||
for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex)
|
||||
{
|
||||
for (const auto blockIndex : c10::irange(blockCount)) {
|
||||
VmaBlockMetadata_Generic* const pMetadata =
|
||||
(VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata;
|
||||
const VkDeviceSize blockSize = pMetadata->GetSize();
|
||||
@ -14778,8 +14736,7 @@ VmaDefragmentationContext_T::~VmaDefragmentationContext_T()
|
||||
|
||||
void VmaDefragmentationContext_T::AddPools(uint32_t poolCount, const VmaPool* pPools)
|
||||
{
|
||||
for(uint32_t poolIndex = 0; poolIndex < poolCount; ++poolIndex)
|
||||
{
|
||||
for (const auto poolIndex : c10::irange(poolCount)) {
|
||||
VmaPool pool = pPools[poolIndex];
|
||||
VMA_ASSERT(pool);
|
||||
// Pools with algorithm other than default are not defragmented.
|
||||
@ -14817,8 +14774,7 @@ void VmaDefragmentationContext_T::AddAllocations(
|
||||
VkBool32* pAllocationsChanged)
|
||||
{
|
||||
// Dispatch pAllocations among defragmentators. Create them when necessary.
|
||||
for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex)
|
||||
{
|
||||
for (const auto allocIndex : c10::irange(allocationCount)) {
|
||||
const VmaAllocation hAlloc = pAllocations[allocIndex];
|
||||
VMA_ASSERT(hAlloc);
|
||||
// DedicatedAlloc cannot be defragmented.
|
||||
@ -15615,14 +15571,12 @@ void VmaRecorder::WriteConfiguration(
|
||||
fprintf(m_File, "PhysicalDeviceLimits,nonCoherentAtomSize,%llu\n", devProps.limits.nonCoherentAtomSize);
|
||||
|
||||
fprintf(m_File, "PhysicalDeviceMemory,HeapCount,%u\n", memProps.memoryHeapCount);
|
||||
for(uint32_t i = 0; i < memProps.memoryHeapCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(memProps.memoryHeapCount)) {
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,size,%llu\n", i, memProps.memoryHeaps[i].size);
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,flags,%u\n", i, memProps.memoryHeaps[i].flags);
|
||||
}
|
||||
fprintf(m_File, "PhysicalDeviceMemory,TypeCount,%u\n", memProps.memoryTypeCount);
|
||||
for(uint32_t i = 0; i < memProps.memoryTypeCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(memProps.memoryTypeCount)) {
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Type,%u,heapIndex,%u\n", i, memProps.memoryTypes[i].heapIndex);
|
||||
fprintf(m_File, "PhysicalDeviceMemory,Type,%u,propertyFlags,%u\n", i, memProps.memoryTypes[i].propertyFlags);
|
||||
}
|
||||
@ -15830,8 +15784,7 @@ VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) :
|
||||
|
||||
if(pCreateInfo->pHeapSizeLimit != VMA_NULL)
|
||||
{
|
||||
for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) {
|
||||
const VkDeviceSize limit = pCreateInfo->pHeapSizeLimit[heapIndex];
|
||||
if(limit != VK_WHOLE_SIZE)
|
||||
{
|
||||
@ -15844,8 +15797,7 @@ VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) :
|
||||
}
|
||||
}
|
||||
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(memTypeIndex);
|
||||
|
||||
m_pBlockVectors[memTypeIndex] = vma_new(this, VmaBlockVector)(
|
||||
@ -16747,14 +16699,11 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
{
|
||||
// Initialize.
|
||||
InitStatInfo(pStats->total);
|
||||
for(size_t i = 0; i < VK_MAX_MEMORY_TYPES; ++i)
|
||||
InitStatInfo(pStats->memoryType[i]);
|
||||
for(size_t i = 0; i < VK_MAX_MEMORY_HEAPS; ++i)
|
||||
InitStatInfo(pStats->memoryHeap[i]);
|
||||
for (const auto i : c10::irange(VK_MAX_MEMORY_TYPES))InitStatInfo(pStats->memoryType[i]);
|
||||
for (const auto i : c10::irange(VK_MAX_MEMORY_HEAPS))InitStatInfo(pStats->memoryHeap[i]);
|
||||
|
||||
// Process default pools.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex];
|
||||
VMA_ASSERT(pBlockVector);
|
||||
pBlockVector->AddStats(pStats);
|
||||
@ -16770,8 +16719,7 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
}
|
||||
|
||||
// Process dedicated allocations.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
const uint32_t memHeapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex);
|
||||
VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex);
|
||||
AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex];
|
||||
@ -16788,10 +16736,8 @@ void VmaAllocator_T::CalculateStats(VmaStats* pStats)
|
||||
|
||||
// Postprocess.
|
||||
VmaPostprocessCalcStatInfo(pStats->total);
|
||||
for(size_t i = 0; i < GetMemoryTypeCount(); ++i)
|
||||
VmaPostprocessCalcStatInfo(pStats->memoryType[i]);
|
||||
for(size_t i = 0; i < GetMemoryHeapCount(); ++i)
|
||||
VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]);
|
||||
for (const auto i : c10::irange(GetMemoryTypeCount()))VmaPostprocessCalcStatInfo(pStats->memoryType[i]);
|
||||
for (const auto i : c10::irange(GetMemoryHeapCount()))VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]);
|
||||
}
|
||||
|
||||
void VmaAllocator_T::GetBudget(VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount)
|
||||
@ -17114,8 +17060,7 @@ VkResult VmaAllocator_T::CheckCorruption(uint32_t memoryTypeBits)
|
||||
VkResult finalRes = VK_ERROR_FEATURE_NOT_PRESENT;
|
||||
|
||||
// Process default pools.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if(((1u << memTypeIndex) & memoryTypeBits) != 0)
|
||||
{
|
||||
VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex];
|
||||
@ -17463,8 +17408,7 @@ VkResult VmaAllocator_T::FlushOrInvalidateAllocations(
|
||||
typedef VmaSmallVector<VkMappedMemoryRange, RangeAllocator, 16> RangeVector;
|
||||
RangeVector ranges = RangeVector(RangeAllocator(GetAllocationCallbacks()));
|
||||
|
||||
for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex)
|
||||
{
|
||||
for (const auto allocIndex : c10::irange(allocationCount)) {
|
||||
const VmaAllocation alloc = allocations[allocIndex];
|
||||
const VkDeviceSize offset = offsets != VMA_NULL ? offsets[allocIndex] : 0;
|
||||
const VkDeviceSize size = sizes != VMA_NULL ? sizes[allocIndex] : VK_WHOLE_SIZE;
|
||||
@ -17559,8 +17503,7 @@ uint32_t VmaAllocator_T::CalculateGlobalMemoryTypeBits() const
|
||||
if(!m_UseAmdDeviceCoherentMemory)
|
||||
{
|
||||
// Exclude memory types that have VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD.
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if((m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0)
|
||||
{
|
||||
memoryTypeBits &= ~(1u << memTypeIndex);
|
||||
@ -17650,8 +17593,7 @@ void VmaAllocator_T::UpdateVulkanBudget()
|
||||
{
|
||||
VmaMutexLockWrite lockWrite(m_Budget.m_BudgetMutex, m_UseMutex);
|
||||
|
||||
for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex)
|
||||
{
|
||||
for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) {
|
||||
m_Budget.m_VulkanUsage[heapIndex] = budgetProps.heapUsage[heapIndex];
|
||||
m_Budget.m_VulkanBudget[heapIndex] = budgetProps.heapBudget[heapIndex];
|
||||
m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] = m_Budget.m_BlockBytes[heapIndex].load();
|
||||
@ -17713,8 +17655,7 @@ uint32_t VmaAllocator_T::GetGpuDefragmentationMemoryTypeBits()
|
||||
void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
{
|
||||
bool dedicatedAllocationsStarted = false;
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex);
|
||||
AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex];
|
||||
VMA_ASSERT(pDedicatedAllocVector);
|
||||
@ -17751,8 +17692,7 @@ void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
|
||||
{
|
||||
bool allocationsStarted = false;
|
||||
for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex)
|
||||
{
|
||||
for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) {
|
||||
if(m_pBlockVectors[memTypeIndex]->IsEmpty() == false)
|
||||
{
|
||||
if(allocationsStarted == false)
|
||||
@ -17783,8 +17723,7 @@ void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json)
|
||||
{
|
||||
json.WriteString("Pools");
|
||||
json.BeginObject();
|
||||
for(size_t poolIndex = 0; poolIndex < poolCount; ++poolIndex)
|
||||
{
|
||||
for (const auto poolIndex : c10::irange(poolCount)) {
|
||||
json.BeginString();
|
||||
json.ContinueString(m_Pools[poolIndex]->GetId());
|
||||
json.EndString();
|
||||
@ -18425,8 +18364,7 @@ VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages(
|
||||
|
||||
if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS)
|
||||
{
|
||||
for(size_t i = 0; i < allocationCount; ++i)
|
||||
{
|
||||
for (const auto i : c10::irange(allocationCount)) {
|
||||
allocator->GetAllocationInfo(pAllocations[i], pAllocationInfo + i);
|
||||
}
|
||||
}
|
||||
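All of these hunks apply the same mechanical rewrite, so it may help to see the helper in isolation. The following is an illustrative sketch only, not part of the patch: the container, values, and function name are invented, and the only repository header assumed is <c10/util/irange.h>.

#include <c10/util/irange.h>

#include <cstdint>
#include <vector>

int64_t irange_demo() {
  std::vector<int64_t> v(8, 1);
  // c10::irange(n) replaces `for (T i = 0; i < n; ++i)`; the index type is
  // deduced from the bound, so it matches numel()/size() without casts.
  for (const auto i : c10::irange(v.size())) {
    v[i] = static_cast<int64_t>(i);
  }
  // The two-argument form is the half-open range [begin, end), used above
  // wherever a loop started at a value other than zero.
  int64_t sum = 0;
  for (const auto i : c10::irange(1, v.size())) {
    sum += v[i] - v[i - 1];
  }
  return sum;
}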
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/native/utils/ParamUtils.h>
|
||||
#include <ATen/native/vulkan/ops/Common.h>
|
||||
#include <ATen/native/vulkan/api/Utils.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -32,7 +33,7 @@ inline bool is_pointwise(const IntArrayRef filter) {
|
||||
|
||||
bool all_lessthan(const IntArrayRef arr, const int t) {
|
||||
bool retval = true;
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
for (const auto i : c10::irange(arr.size())) {
|
||||
retval = retval && (arr[i] < t);
|
||||
}
|
||||
return retval;
|
||||
@ -173,8 +174,8 @@ vTensor pack_weights_2d(
|
||||
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
|
||||
const int64_t dst_ic4 = src_ic / 4;
|
||||
|
||||
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
|
||||
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
|
||||
for (const auto src_ih : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_iw : c10::irange(src_kw_sz)) {
|
||||
memcpy(
|
||||
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
|
||||
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
|
||||
@ -225,11 +226,11 @@ vTensor pack_weights_2d_winograd_2_3(
|
||||
float* const dst_weight_ptr = v_weight_payload.get();
|
||||
memset(dst_weight_ptr, 0, v_weight.nbytes());
|
||||
|
||||
for (int64_t src_oc = 0; src_oc < src_oc_sz; ++src_oc) {
|
||||
for (const auto src_oc : c10::irange(src_oc_sz)) {
|
||||
const int64_t dst_oh = src_oc / 4;
|
||||
const int64_t dst_iw = src_oc % 4;
|
||||
|
||||
for (int64_t src_ic = 0; src_ic < src_ic_sz; ++src_ic) {
|
||||
for (const auto src_ic : c10::irange(src_ic_sz)) {
|
||||
const int64_t dst_ow = src_ic / 4;
|
||||
const int64_t dst_c = src_ic % 4;
|
||||
|
||||
@ -344,7 +345,7 @@ vTensor pack_biases(
|
||||
float* const dst_bias_ptr = v_bias_payload.get();
|
||||
|
||||
memset(dst_bias_ptr, 0, v_bias.nbytes());
|
||||
for (int64_t i = 0; i < src_w; ++i) {
|
||||
for (const auto i : c10::irange(src_w)) {
|
||||
const int64_t c = i % 4;
|
||||
const int64_t x = i / 4;
|
||||
dst_bias_ptr[c * packed_w + x] = src_bias_ptr[i];
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/native/vulkan/ops/Mm.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -47,8 +48,8 @@ vTensor pack_weights(
|
||||
float* const dst_weight_ptr = v_weight_payload.get();
|
||||
memset(dst_weight_ptr, 0, v_weight.nbytes());
|
||||
|
||||
for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) {
|
||||
for (int64_t src_w = 0; src_w < src_kw_sz; ++src_w) {
|
||||
for (const auto src_h : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_w : c10::irange(src_kw_sz)) {
|
||||
int64_t dst_plane = 2*(src_h%2) + (src_w%2);
|
||||
int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2);
|
||||
memcpy(
|
||||
@ -109,8 +110,8 @@ vTensor pack_biases(
|
||||
float* const dst_bias_ptr = v_bias_payload.get();
|
||||
memset(dst_bias_ptr, 0, v_bias.nbytes());
|
||||
|
||||
for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) {
|
||||
for (int64_t src_w = 0; src_w < src_kw_sz; ++src_w) {
|
||||
for (const auto src_h : c10::irange(src_kh_sz)) {
|
||||
for (const auto src_w : c10::irange(src_kw_sz)) {
|
||||
int64_t dst_plane = 2*(src_h%2) + (src_w%2);
|
||||
int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2);
|
||||
memcpy(
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/native/vulkan/ops/Common.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
namespace at {
|
||||
@ -35,7 +36,7 @@ Tensor reflection_pad2d(const Tensor& self_arg, IntArrayRef padding) {
|
||||
const vTensor& v_self = convert(self);
|
||||
|
||||
c10::SmallVector<int64_t, 4> output_size(input_dim);
|
||||
for (size_t d = 0; d < input_dim; ++d) {
|
||||
for (const auto d : c10::irange(input_dim)) {
|
||||
if (d == input_dim - 1) {
|
||||
output_size[d] = input_size[d] + pad_right + pad_left;
|
||||
} else if (d == input_dim - 2) {
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <ATen/native/utils/Factory.h>
|
||||
#include <ATen/native/utils/ParamUtils.h>
|
||||
#include <ATen/native/xnnpack/Convolution.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -150,11 +151,11 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc,
|
||||
float* in_ptr = weight_nhwc.data_ptr<float>();
|
||||
|
||||
int out_index = 0;
|
||||
for (int g = 0; g < num_groups; g++) {
|
||||
for (int o = 0; o < output_channels_per_group; o++) {
|
||||
for (int w = 0; w < kernel_width; w++) {
|
||||
for (int h = 0; h < kernel_height; h++) {
|
||||
for (int i = 0; i < input_channels_per_group; i++) {
|
||||
for (const auto g : c10::irange(num_groups)) {
|
||||
for (const auto o : c10::irange(output_channels_per_group)) {
|
||||
for (const auto w : c10::irange(kernel_width)) {
|
||||
for (const auto h : c10::irange(kernel_height)) {
|
||||
for (const auto i : c10::irange(input_channels_per_group)) {
|
||||
int in_index = (g*g_offset) + (i*i_offset) + (h*h_offset) + (w*w_offset) + (o*o_offset);
|
||||
out_ptr[out_index] = in_ptr[in_index];
|
||||
out_index++;
|
||||
@ -210,7 +211,7 @@ ContextConv2D create(
|
||||
|
||||
if (transposed) {
|
||||
const Tensor weight_reordered = reorder_weights_for_transpose_conv(weight_nhwc, groups);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
weight_sizes[i] = weight_reordered.size(i);
|
||||
}
|
||||
create_status = xnn_create_deconvolution2d_nhwc_f32(
|
||||
@ -238,7 +239,7 @@ ContextConv2D create(
|
||||
0u, // flags
|
||||
&convolution_op); // operator
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
weight_sizes[i] = weight_nhwc.size(i);
|
||||
}
|
||||
create_status = xnn_create_convolution2d_nhwc_f32(
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/nnapi/nnapi_bind.h>
|
||||
#include <ATen/nnapi/nnapi_wrapper.h>
|
||||
#include <ATen/nnapi/nnapi_model_loader.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace torch {
|
||||
namespace nnapi {
|
||||
@ -103,7 +104,7 @@ void NnapiCompilation::run(
|
||||
TORCH_CHECK((int32_t)inputs.size() == num_inputs_);
|
||||
TORCH_CHECK((int32_t)outputs.size() == num_outputs_);
|
||||
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(inputs.size())) {
|
||||
auto& t = inputs[i];
|
||||
// TODO: Check contiguous and dtype.
|
||||
ANeuralNetworksOperandType op_type;
|
||||
@ -117,7 +118,7 @@ void NnapiCompilation::run(
|
||||
t.nbytes());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
auto& t = outputs[i];
|
||||
// TODO: Check contiguous and dtype.
|
||||
check_nnapi->Execution_setOutput(
|
||||
@ -131,7 +132,7 @@ void NnapiCompilation::run(
|
||||
check_nnapi->Execution_compute(execution);
|
||||
|
||||
// TODO: Maybe skip this for fixed-size outputs?
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
auto& t = outputs[i];
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
uint32_t rank;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/CPUApplyUtils.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
@ -10,7 +11,7 @@ using namespace at;
|
||||
|
||||
void fill_tensor(int64_t scalar, Tensor& t_) {
|
||||
auto t = t_.view(-1);
|
||||
for (int64_t i = 0; i < t.numel(); i++) {
|
||||
for (const auto i : c10::irange(t.numel())) {
|
||||
t[i] = (i + 1) * scalar;
|
||||
}
|
||||
}
|
||||
@ -42,7 +43,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
auto a4 = at::empty({0}, at::TensorOptions(kCPU).dtype(kDouble));
|
||||
|
||||
std::vector<Tensor> tensors({a0, a1, a2, a3, a4});
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
tensors[i].resize_(shape);
|
||||
fill_tensor(i + 1, tensors[i]);
|
||||
if (a >= 0 && b >= 0) {
|
||||
@ -55,7 +56,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; });
|
||||
CPU_tensor_apply2<double, scalar_t>(
|
||||
a4, a1, [](double& y, scalar_t x) { y = (double)(x * x); });
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
ASSERT(a4.data_ptr<double>()[i] == target);
|
||||
@ -71,7 +72,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
a4, a1, a2, [](double& y, const scalar_t& x, const scalar_t& z) {
|
||||
y = (double)(x * x + z);
|
||||
});
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
target = target + a2.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
@ -97,7 +98,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6
|
||||
[](double& y, const scalar_t& x, const scalar_t& z, const scalar_t& a) {
|
||||
y = (double)(x * x + z * a);
|
||||
});
|
||||
for (int64_t i = 0; i < a0.numel(); i++) {
|
||||
for (const auto i : c10::irange(a0.numel())) {
|
||||
auto target = a1.data_ptr<scalar_t>()[i] * a1.data_ptr<scalar_t>()[i];
|
||||
target = target + a2.data_ptr<scalar_t>()[i] * a3.data_ptr<scalar_t>()[i];
|
||||
ASSERT(a0.data_ptr<scalar_t>()[i] == target);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
@ -102,7 +103,7 @@ void trace() {
|
||||
auto foo_a = foo.accessor<float, 2>();
|
||||
float trace = 0;
|
||||
|
||||
for (int i = 0; i < foo_a.size(0); i++) {
|
||||
for (const auto i : c10::irange(foo_a.size(0))) {
|
||||
trace += foo_a[i][i];
|
||||
}
|
||||
|
||||
@ -237,8 +238,8 @@ TEST_F(atest, atest) {
|
||||
// foo = foo[3];
|
||||
auto foo_v = foo.accessor<uint8_t, 2>();
|
||||
|
||||
for (int i = 0; i < foo_v.size(0); i++) {
|
||||
for (int j = 0; j < foo_v.size(1); j++) {
|
||||
for (const auto i : c10::irange(foo_v.size(0))) {
|
||||
for (const auto j : c10::irange(foo_v.size(1))) {
|
||||
foo_v[i][j]++;
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/core/Reduction.h>
|
||||
#include <torch/cuda.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
// for TH compat test only...
|
||||
struct THFloatTensor;
|
||||
@ -84,7 +85,8 @@ void TestAdd(DeprecatedTypeProperties& type) {
|
||||
void TestZeros(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor a = zeros({1024, 1024}, type);
|
||||
for (int i = 1; i < 1000; ++i) {
|
||||
for (const auto i : c10::irange(1, 1000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
a = zeros({128, 128}, type);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
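In timing loops like TestZeros above, and in the warm-up and trial loops later in the patch, the index is only a repetition counter, so the range-for version names a variable the body never reads. The patch therefore adds a (void) cast; a minimal sketch of that pattern, with an invented body and assuming only <c10/util/irange.h>:

#include <c10/util/irange.h>

int repeat_demo() {
  int acc = 0;
  for (const auto i : c10::irange(1000)) {
    // The old `for (int i = 0; i < 1000; ++i)` used `i` in its condition, so
    // no warning fired; the range-for form needs an explicit suppression.
    (void)i; // Suppress unused variable warning
    acc += 1;
  }
  return acc;
}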
@ -102,7 +104,8 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (auto i = 0; i < 100000; i++) {
|
||||
for (const auto i : c10::irange(100000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
add_out(r, r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -119,7 +122,8 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (auto i = 0; i < 100000; i++) {
|
||||
for (const auto i : c10::irange(100000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
r = add(r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -176,7 +180,7 @@ void TestCopyBroadcasting(DeprecatedTypeProperties& type) {
|
||||
Tensor a = zeros({4, 3}, type);
|
||||
Tensor e = rand({3}, type);
|
||||
a.copy_(e);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (const auto i : c10::irange(4)) {
|
||||
ASSERT_TRUE(a[i].equal(e));
|
||||
}
|
||||
}
|
||||
@ -247,13 +251,13 @@ void TestToString() {
|
||||
void TestIndexingByScalar() {
|
||||
Tensor tensor = arange(0, 10, kInt);
|
||||
Tensor one = ones({}, kInt);
|
||||
for (int64_t i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * i));
|
||||
}
|
||||
for (size_t i = 0; i < static_cast<uint64_t>(tensor.numel()); ++i) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * static_cast<int64_t>(i)));
|
||||
}
|
||||
for (int i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[i].equal(one * i));
|
||||
}
|
||||
// NOLINTNEXTLINE(bugprone-too-small-loop-variable)
|
||||
@ -272,7 +276,7 @@ void TestIndexingByScalar() {
|
||||
void TestIndexingByZerodimTensor() {
|
||||
Tensor tensor = arange(0, 10, kInt);
|
||||
Tensor one = ones({}, kInt);
|
||||
for (int i = 0; i < tensor.numel(); ++i) {
|
||||
for (const auto i : c10::irange(tensor.numel())) {
|
||||
ASSERT_TRUE(tensor[one * i].equal(one * i));
|
||||
}
|
||||
// Throw StartsWith(
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <ATen/Utils.h>
|
||||
#include <ATen/CPUGeneratorImpl.h>
|
||||
#include <ATen/core/PhiloxRNGEngine.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <thread>
|
||||
#include <limits>
|
||||
#include <random>
|
||||
@ -160,7 +161,8 @@ TEST(CPUGeneratorImpl, TestPhiloxEngineOffset1) {
|
||||
// So if you want to skip 8 values, offset would
|
||||
// be 2, since 2*4=8.
|
||||
at::Philox4_32_10 engine2(123, 1, 2);
|
||||
for(int i = 0; i < 8; i++){
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
// Note: instead of using the engine() call 8 times
|
||||
// we could have achieved the same functionality by
|
||||
// calling the incr() function twice.
|
||||
@ -221,14 +223,16 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
// test with zero seed
|
||||
at::mt19937 engine1(0);
|
||||
std::mt19937 engine2(0);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
// test with large seed
|
||||
engine1 = at::mt19937(2147483647);
|
||||
engine2 = std::mt19937(2147483647);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
@ -237,7 +241,8 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
auto seed = rd();
|
||||
engine1 = at::mt19937(seed);
|
||||
engine2 = std::mt19937(seed);
|
||||
for(int i = 0; i < 10000; i++) {
|
||||
for (const auto i : c10::irange(10000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/init.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/core/context_gpu.h>
|
||||
@ -34,7 +35,7 @@ TEST(CUDACaffe2ToPytorch, SimpleLegacy) {
|
||||
|
||||
auto at_cpu = at_tensor.cpu();
|
||||
auto it = at_cpu.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], 777);
|
||||
}
|
||||
}
|
||||
@ -53,7 +54,7 @@ TEST(CUDACaffe2ToPytorch, Simple) {
|
||||
|
||||
auto at_cpu = at_tensor.cpu();
|
||||
auto it = at_cpu.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], 777);
|
||||
}
|
||||
}
|
||||
@ -109,7 +110,7 @@ TEST(CUDAPytorchToCaffe2, Op) {
|
||||
ASSERT_EQ(result.GetDeviceType(), caffe2::CUDA);
|
||||
|
||||
auto data = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(cuda_get(data + i), 3.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/torch.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/core/Dict.h>
|
||||
|
||||
// Snippets for checking assembly.
|
||||
@ -643,7 +644,7 @@ TEST(IValueTest, IdentityComparisonAndHashing) {
|
||||
auto moreSampleIValues = makeMoreSampleIValues();
|
||||
|
||||
ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size());
|
||||
for (int ii = 0; ii < sampleIValues.size(); ++ii) {
|
||||
for (const auto ii : c10::irange(sampleIValues.size())) {
|
||||
if (sampleIValues[ii].isComplexDouble() ||
|
||||
sampleIValues[ii].isBlob() ||
|
||||
sampleIValues[ii].isList() ||
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/CPUFunctions.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -115,7 +116,7 @@ TEST(MathKernelTest, MishBackward) {
|
||||
|
||||
TEST(MathKernelTest, NarrowCopy) {
|
||||
auto x = rand({5, 8, 7});
|
||||
for (int64_t dim = 0; dim < 3; ++dim) {
|
||||
for (const auto dim : c10::irange(3)) {
|
||||
const int64_t start = 1, length = 4;
|
||||
auto y_ref = x.narrow(dim, start, length);
|
||||
auto y_test = at::native::narrow_copy_dense(x, dim, start, length);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -16,7 +17,7 @@ using namespace at;
|
||||
|
||||
void requireEqualTensorList(TensorList t1, TensorList t2) {
|
||||
ASSERT_EQ(t1.size(), t2.size());
|
||||
for (size_t i = 0; i < t1.size(); ++i) {
|
||||
for (const auto i : c10::irange(t1.size())) {
|
||||
ASSERT_EQUAL(t1[i], t2[i]);
|
||||
}
|
||||
}
|
||||
@ -74,7 +75,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::stack);
|
||||
}
|
||||
}
|
||||
@ -85,7 +86,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::native::_stack);
|
||||
}
|
||||
}
|
||||
@ -96,7 +97,7 @@ void TestStack(TensorOptions T, Tensor& t) {
|
||||
auto z = rand({2, 3, 4});
|
||||
|
||||
auto inputs = {x, y, z};
|
||||
for (int64_t dim = 0; dim < 4; ++dim) {
|
||||
for (const auto dim : c10::irange(4)) {
|
||||
_test_stack(inputs, dim, at::native::_stack_cpu);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <ATen/Operators.h>
|
||||
#include <ATen/test/test_assert.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
@ -34,7 +35,7 @@ TEST(PackedtensoraccessorTest, TransposeTest) {
|
||||
t = rand({size}, CPU(kFloat));
|
||||
auto original_1d = t.packed_accessor64<float, 1, DefaultPtrTraits>();
|
||||
auto transposed_1d = original_1d.transpose(0, 0);
|
||||
for (int i = 0; i < size; i++){
|
||||
for (const auto i : c10::irange(size)) {
|
||||
ASSERT_EQ(original_1d[i], transposed_1d[i]);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/native/Pow.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <torch/types.h>
|
||||
#include <torch/utils.h>
|
||||
@ -203,7 +204,7 @@ void tensor_pow_tensor(const Vals vals, c10::ScalarType vals_dtype, Pows pows, c
|
||||
std::cout.precision(dbl::max_digits10);
|
||||
|
||||
const auto vals_tensor = torch::tensor(vals, vals_dtype);
|
||||
for (size_t shift = 0; shift < pows.size(); shift++) {
|
||||
for (const auto shift : c10::irange(pows.size())) {
|
||||
const auto pows_tensor = torch::tensor(pows, pows_dtype);
|
||||
|
||||
const auto actual_pow = vals_tensor.pow(pows_tensor);
|
||||
|
@ -11,6 +11,7 @@
|
||||
// For quantize_val
|
||||
#include <ATen/native/quantized/affine_quantizer.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/quantized/Quantizer.h>
|
||||
|
||||
using namespace at;
|
||||
@ -30,14 +31,14 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
// int_repr
|
||||
Tensor int_repr = qr.int_repr();
|
||||
auto* int_repr_data = int_repr.data_ptr<uint8_t>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(int_repr_data[i], 3);
|
||||
}
|
||||
|
||||
// Check for correct quantization
|
||||
auto r_data = r.data_ptr<float>();
|
||||
auto qr_data = qr.data_ptr<quint8>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(
|
||||
native::quantize_val<quint8>(scale, zero_point, r_data[i]).val_,
|
||||
qr_data[i].val_);
|
||||
@ -46,10 +47,10 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
// Check for correct dequantization
|
||||
Tensor rqr = qr.dequantize();
|
||||
auto rqr_data = rqr.data_ptr<float>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(r_data[i], rqr_data[i]);
|
||||
}
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
ASSERT_EQ(
|
||||
r_data[i],
|
||||
native::dequantize_val(qr.q_scale(), qr.q_zero_point(), qr_data[i]));
|
||||
@ -60,7 +61,7 @@ TEST(TestQTensor, QuantDequantAPIs) {
|
||||
int64_t new_zero_point = 1;
|
||||
Tensor reqr = at::quantize_per_tensor(r, new_scale, new_zero_point, kQInt8);
|
||||
auto reqr_data = reqr.data_ptr<qint8>();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
for (const auto i : c10::irange(num_elements)) {
|
||||
reqr_data[i].val_ =
|
||||
native::requantize_val<quint8, qint8>(
|
||||
scale, zero_point, new_scale, new_zero_point, qr_data[i])
|
||||
@ -85,7 +86,7 @@ TEST(TestQTensor, RoundingMode) {
|
||||
Tensor qx = at::quantize_per_tensor(x, /*scale=*/1.0, zero_point, kQUInt8);
|
||||
|
||||
auto qx_data = qx.data_ptr<quint8>();
|
||||
for (size_t idx = 0; idx < x_values.size(); ++idx) {
|
||||
for (const auto idx : c10::irange(x_values.size())) {
|
||||
ASSERT_EQ(qx_expect[idx], qx_data[idx].val_)
|
||||
<< "Tie breaking during rounding element " << idx << " failed!";
|
||||
}
|
||||
@ -108,14 +109,14 @@ TEST(TestQTensor, EmptyQuantized) {
|
||||
{numel}, at::device(at::kCPU).dtype(kQUInt8), scale, zero_point);
|
||||
// Assigning to QTensor
|
||||
auto* q_data = q.data_ptr<quint8>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
q_data[i].val_ = val;
|
||||
}
|
||||
|
||||
// dequantize
|
||||
auto r = q.dequantize();
|
||||
auto* r_data = r.data_ptr<float>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ(r_data[i], (val - zero_point) * scale);
|
||||
}
|
||||
}
|
||||
@ -134,14 +135,14 @@ TEST(TestQTensor, EmptyPerchannelQuantized) {
|
||||
at::device(at::kCPU).dtype(kQUInt8));
|
||||
// Assigning to QTensor
|
||||
auto* q_data = q.data_ptr<quint8>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
q_data[i].val_ = val;
|
||||
}
|
||||
|
||||
// dequantize
|
||||
auto r = q.dequantize();
|
||||
auto* r_data = r.data_ptr<float>();
|
||||
for (int i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ(
|
||||
r_data[i],
|
||||
(val - zero_points[i].item().to<int>()) * scales[i].item().to<float>());
|
||||
@ -222,7 +223,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) {
|
||||
custom_vec->reserve(numel);
|
||||
|
||||
uint8_t* custom_data = custom_vec->data();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
custom_data[i] = i;
|
||||
}
|
||||
bool customDataDeleted{false};
|
||||
@ -236,7 +237,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) {
|
||||
Tensor qtensor = at::from_blob_quantized_per_tensor_affine(custom_data, shape, deleter, scale, zero_point, options);
|
||||
|
||||
uint8_t* q_data = (uint8_t*)qtensor.data_ptr<quint8>();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ((int)custom_data[i], (int)q_data[i]);
|
||||
}
|
||||
ASSERT_EQ((float)qtensor.q_scale(), (float)scale);
|
||||
@ -258,7 +259,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) {
|
||||
custom_vec->reserve(numel);
|
||||
|
||||
uint8_t* custom_data = custom_vec->data();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
custom_data[i] = i;
|
||||
}
|
||||
bool customDataDeleted{false};
|
||||
@ -271,7 +272,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) {
|
||||
{
|
||||
Tensor qtensor = at::from_blob_quantized_per_channel_affine(custom_data, shape, deleter, scales, zero_points, ch_axis, options);
|
||||
uint8_t* q_data = (uint8_t*)qtensor.data_ptr<quint8>();
|
||||
for (auto i = 0; i < numel; ++i) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
ASSERT_EQ((int)custom_data[i], (int)q_data[i]);
|
||||
}
|
||||
ASSERT_TRUE(at::allclose(qtensor.q_per_channel_scales(), scales));
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/init.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
|
||||
@ -8,13 +9,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) {
|
||||
caffe2::Tensor c2_tensor(caffe2::CPU);
|
||||
c2_tensor.Resize(4, 4);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
}
|
||||
@ -22,13 +23,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) {
|
||||
TEST(Caffe2ToPytorch, Simple) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
}
|
||||
@ -37,7 +38,7 @@ TEST(Caffe2ToPytorch, ExternalData) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers)
|
||||
int64_t buf[16];
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
buf[i] = i;
|
||||
}
|
||||
c2_tensor.ShareExternalPointer(buf, 16 * sizeof(int64_t));
|
||||
@ -48,7 +49,7 @@ TEST(Caffe2ToPytorch, ExternalData) {
|
||||
at_tensor.permute({1, 0});
|
||||
at_tensor.permute({1, 0});
|
||||
auto it = at_tensor.data_ptr<int64_t>();
|
||||
for (int64_t i = 0; i < 16; i++) {
|
||||
for (const auto i : c10::irange(16)) {
|
||||
ASSERT_EQ(it[i], i);
|
||||
}
|
||||
ASSERT_FALSE(at_tensor.storage().resizable());
|
||||
@ -60,7 +61,7 @@ TEST(Caffe2ToPytorch, Op) {
|
||||
caffe2::Tensor c2_tensor(caffe2::CPU);
|
||||
c2_tensor.Resize(3, 3);
|
||||
auto data = c2_tensor.mutable_data<int64_t>();
|
||||
for (int64_t i = 0; i < 9; i++) {
|
||||
for (const auto i : c10::irange(9)) {
|
||||
data[i] = i;
|
||||
}
|
||||
at::Tensor at_tensor(c2_tensor);
|
||||
@ -107,7 +108,7 @@ TEST(Caffe2ToPytorch, PartiallyInitialized) {
|
||||
TEST(Caffe2ToPytorch, MutualResizes) {
|
||||
caffe2::Tensor c2_tensor = caffe2::empty({5, 5}, at::kFloat);
|
||||
auto data = c2_tensor.mutable_data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
data[i] = 0;
|
||||
}
|
||||
|
||||
@ -171,7 +172,7 @@ TEST(PytorchToCaffe2, Op) {
|
||||
auto result = XBlobGetMutableTensor(workspace.CreateBlob("d"), {5, 5}, at::kCPU);
|
||||
|
||||
auto it = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(it[i], 3.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
@ -202,7 +203,7 @@ TEST(PytorchToCaffe2, SharedStorageRead) {
|
||||
|
||||
auto result = XBlobGetMutableTensor(workspace.CreateBlob("c"), {5, 5}, at::kCPU);
|
||||
auto it = result.data<float>();
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(it[i], 2.0);
|
||||
}
|
||||
at::Tensor at_result(result);
|
||||
@ -259,7 +260,7 @@ TEST(PytorchToCaffe2, Strided) {
|
||||
ASSERT_ANY_THROW(caffe2::Tensor c2_tensor(at_tensor));
|
||||
// but calling contiguous is fine
|
||||
caffe2::Tensor c2_tensor(at_tensor.contiguous());
|
||||
for (int64_t i = 0; i < 25; i++) {
|
||||
for (const auto i : c10::irange(25)) {
|
||||
ASSERT_EQ(c2_tensor.data<float>()[i], 1.0);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <test/cpp/tensorexpr/test_base.h>
|
||||
#include <thread>
|
||||
|
||||
@ -13,7 +14,8 @@ void test(int given_num_threads) {
|
||||
ASSERT_TRUE(given_num_threads >= 0);
|
||||
ASSERT_EQ(at::get_num_threads(), given_num_threads);
|
||||
auto t_sum = t.sum();
|
||||
for (int i = 0; i < 1000; ++i) {
|
||||
for (const auto i : c10::irange(1000)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
t_sum = t_sum + t.sum();
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <ATen/test/vec_test_all_types.h>
|
||||
#include <c10/util/irange.h>
|
||||
namespace {
|
||||
#if GTEST_HAS_TYPED_TEST
|
||||
template <typename T>
|
||||
@ -455,7 +456,7 @@ namespace {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
CACHE_ALIGN VT expected_vals[vec::size()];
|
||||
auto vals = 1 << (vec::size());
|
||||
for (int val = 0; val < vals; ++val) {
|
||||
for (const auto val : c10::irange(vals)) {
|
||||
for (int i = 0; i < vec::size(); ++i) {
|
||||
if (val & (1 << i)) {
|
||||
test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
|
||||
@ -747,7 +748,7 @@ namespace {
|
||||
CACHE_ALIGN VT test_vals[vec::size()];
|
||||
//all sets will be within 0 2^(n-1)
|
||||
auto power_sets = 1 << (vec::size());
|
||||
for (int expected = 0; expected < power_sets; expected++) {
|
||||
for (const auto expected : c10::irange(power_sets)) {
|
||||
// generate test_val based on expected
|
||||
for (int i = 0; i < vec::size(); ++i)
|
||||
{
|
||||
@ -894,7 +895,7 @@ namespace {
|
||||
void blend_init(T(&a)[N], T(&b)[N]) {
|
||||
a[0] = (T)1.0;
|
||||
b[0] = a[0] + (T)N;
|
||||
for (int i = 1; i < N; i++) {
|
||||
for (const auto i : c10::irange(1, N)) {
|
||||
a[i] = a[i - 1] + (T)(1.0);
|
||||
b[i] = b[i - 1] + (T)(1.0);
|
||||
}
|
||||
@ -905,7 +906,7 @@ namespace {
|
||||
auto add = Complex<float>(1., 100.);
|
||||
a[0] = Complex<float>(1., 100.);
|
||||
b[0] = Complex<float>(5., 1000.);
|
||||
for (int i = 1; i < 4; i++) {
|
||||
for (const auto i : c10::irange(1, 4)) {
|
||||
a[i] = a[i - 1] + add;
|
||||
b[i] = b[i - 1] + add;
|
||||
}
|
||||
@ -1051,7 +1052,8 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
@ -1088,7 +1090,8 @@ namespace {
|
||||
ValueGen<int> generator(min_val, max_val, seed.add(1));
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
@ -1135,7 +1138,8 @@ namespace {
|
||||
ValueGen<int32_t> generator(min_val, max_val, seed);
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(1));
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
@ -1172,7 +1176,8 @@ namespace {
|
||||
typename vec::int_vec_return_type expected_int_ret;
|
||||
auto seed = TestSeed();
|
||||
ValueGen<underlying> generator(min_val, max_val, seed);
|
||||
for (int i = 0; i < trials; i++) {
|
||||
for (const auto i : c10::irange(trials)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
//generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
@ -1251,7 +1256,7 @@ namespace {
|
||||
CACHE_ALIGN VT ref_y[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<VT> generator(VT(-100), VT(100), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x1[i] = generator.get();
|
||||
x2[i] = generator.get();
|
||||
x3[i] = generator.get();
|
||||
@ -1263,19 +1268,19 @@ namespace {
|
||||
};
|
||||
// test map: y = x1
|
||||
at::vec::map<VT>([](vec x) { return x; }, y, x1, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map2: y = x1 + x2
|
||||
at::vec::map2<VT>([](vec x1, vec x2) { return x1 + x2; }, y, x1, x2, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map3: y = x1 + x2 + x3
|
||||
at::vec::map3<VT>([](vec x1, vec x2, vec x3) { return x1 + x2 + x3; }, y, x1, x2, x3, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i]; }
|
||||
cmp(y, ref_y);
|
||||
// test map4: y = x1 + x2 + x3 + x4
|
||||
at::vec::map4<VT>([](vec x1, vec x2, vec x3, vec x4) { return x1 + x2 + x3 + x4; }, y, x1, x2, x3, x4, N);
|
||||
for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; }
|
||||
for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; }
|
||||
cmp(y, ref_y);
|
||||
}
|
||||
TYPED_TEST(FunctionalBF16Tests, Reduce) {
|
||||
@ -1294,7 +1299,7 @@ namespace {
|
||||
CACHE_ALIGN VT x_b3[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<RT> generator(RT(-1), RT(1), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x_f1[i] = generator.get();
|
||||
x_f2[i] = generator.get();
|
||||
x_f3[i] = generator.get();
|
||||
@ -1362,7 +1367,7 @@ namespace {
|
||||
CACHE_ALIGN VT y_b[N];
|
||||
auto seed = TestSeed();
|
||||
ValueGen<RT> generator(RT(-1), RT(1), seed);
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
x_f1[i] = generator.get();
|
||||
x_f2[i] = generator.get();
|
||||
x_f3[i] = generator.get();
|
||||
@ -1379,7 +1384,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map<RT>([](auto x) { return x; }, y_f, x_f1, len);
|
||||
at::vec::map<VT>([](auto x) { return x; }, y_b, x_b1, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1388,7 +1393,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map2<RT>([](auto x, auto y) { return x + y; }, y_f, x_f1, x_f2, len);
|
||||
at::vec::map2<VT>([](auto x, auto y) { return x + y; }, y_b, x_b1, x_b2, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap2, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1397,7 +1402,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map3<RT>([](auto x, auto y, auto z) { return x + y * z; }, y_f, x_f1, x_f2, x_f3, len);
|
||||
at::vec::map3<VT>([](auto x, auto y, auto z) { return x + y * z; }, y_b, x_b1, x_b2, x_b3, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap3, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
@ -1406,7 +1411,7 @@ namespace {
|
||||
for (int64_t len = 1; len <= N; len++) {
|
||||
at::vec::map4<RT>([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_f, x_f1, x_f2, x_f3, x_f4, len);
|
||||
at::vec::map4<VT>([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_b, x_b1, x_b2, x_b3, x_b4, len);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed
|
||||
<< "\nmap4, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]);
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
@ -869,8 +870,7 @@ public:
|
||||
act.store(actArr);
|
||||
if (bitwise)
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
BVT b_exp = bit_cast<BVT>(expArr[i]);
|
||||
BVT b_act = bit_cast<BVT>(actArr[i]);
|
||||
EXPECT_EQ(b_exp, b_act) << getDetail(i / unitStorageCount);
|
||||
@ -880,8 +880,7 @@ public:
|
||||
}
|
||||
else if (checkWithTolerance)
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
EXPECT_EQ(nearlyEqual<UVT>(expArr[i], actArr[i], absErr), true) << expArr[i] << "!=" << actArr[i] << "\n" << getDetail(i / unitStorageCount);
|
||||
if (::testing::Test::HasFailure())
|
||||
return true;
|
||||
@ -889,8 +888,7 @@ public:
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < sizeX; i++)
|
||||
{
|
||||
for (const auto i : c10::irange(sizeX)) {
|
||||
if (std::is_same<UVT, float>::value)
|
||||
{
|
||||
if (!check_both_nan(expArr[i], actArr[i])) {
|
||||
@ -952,8 +950,9 @@ void test_unary(
|
||||
UVT start = dmn_argc > 0 ? dmn.ArgsDomain[0].start : default_start;
|
||||
UVT end = dmn_argc > 0 ? dmn.ArgsDomain[0].end : default_end;
|
||||
ValueGen<VT> generator(start, end, seed.add(changeSeedBy));
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
//map operator
|
||||
@ -1011,8 +1010,9 @@ void test_binary(
|
||||
UVT end1 = dmn_argc > 1 ? dmn.ArgsDomain[1].end : default_end;
|
||||
ValueGen<VT> generator0(start0, end0, seed.add(changeSeedBy));
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
@ -1076,8 +1076,9 @@ void test_ternary(
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
ValueGen<VT> generator2(start2, end2, seed.add(changeSeedBy + 2));
|
||||
|
||||
for (int trial = 0; trial < trialCount; trial++) {
|
||||
for (int k = 0; k < el_count; k++) {
|
||||
for (const auto trial : c10::irange(trialCount)) {
|
||||
(void)trial; // Suppress unused variable warning
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/core/Vitals.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace at::vitals;
|
||||
@ -62,7 +63,7 @@ TEST(Vitals, MultiString) {
|
||||
}
|
||||
|
||||
TEST(Vitals, OnAndOff) {
|
||||
for (auto i = 0; i < 2; ++i) {
|
||||
for (const auto i : c10::irange(2)) {
|
||||
std::stringstream buffer;
|
||||
|
||||
std::streambuf* sbuf = std::cout.rdbuf();
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/BatchedTensorImpl.h>
|
||||
#include <ATen/VmapTransforms.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using namespace at;
|
||||
|
||||
@ -55,7 +56,7 @@ TEST(VmapTest, TestBatchedTensor) {
|
||||
// returns {{lvl=0,dim=0}, {lvl=1,dim=1}, ..., {lvl=kVmapNumLevels-1,dim=kVmapNumLevels-1}};
|
||||
static BatchDims maxBatchDimsAtFront() {
|
||||
BatchDims result;
|
||||
for (int64_t lvl = 0; lvl < kVmapNumLevels; lvl++) {
|
||||
for (const auto lvl : c10::irange(kVmapNumLevels)) {
|
||||
result.emplace_back(lvl, /*dim=*/lvl);
|
||||
}
|
||||
return result;
|
||||
@ -169,7 +170,8 @@ TEST(VmapTest, TestBatchedTensorActualDim) {
|
||||
{
|
||||
// ActualDim on kVmapMaxTensorDims sized underlying tensor
|
||||
auto tensor = ones({});
|
||||
for (int64_t i = 0; i < kVmapMaxTensorDims; i++) {
|
||||
for (const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
tensor = tensor.unsqueeze(0);
|
||||
}
|
||||
ASSERT_EQ(tensor.dim(), kVmapMaxTensorDims);
|
||||
@ -260,7 +262,7 @@ TEST(VmapTest, TestMultiBatchVmapTransform) {
|
||||
BatchDims batch_dims = {
|
||||
{0, 2}, {1, 1}, {2, kVmapNumLevels - 1}, {3, 5}, {4, 0}, {5, 3}, {6, 4}
|
||||
};
|
||||
for (int64_t level = 7; level < kVmapNumLevels; level++ ) {
|
||||
for (const auto level : c10::irange(7, kVmapNumLevels)) {
|
||||
batch_dims.emplace_back(level, /*dim=*/level - 1);
|
||||
}
|
||||
auto tensor = ones(sizes);
|
||||
@ -303,7 +305,7 @@ TEST(VmapTest, TestVmapPhysicalViewGetPhysicalDims) {
|
||||
|
||||
static void checkBatchDimsEqual(BatchDimsRef bdims, BatchDimsRef expected_bdims) {
|
||||
ASSERT_EQ(bdims.size(), expected_bdims.size());
|
||||
for (int64_t idx = 0; idx < bdims.size(); idx++) {
|
||||
for (const auto idx : c10::irange(bdims.size())) {
|
||||
ASSERT_EQ(bdims[idx].dim(), expected_bdims[idx].dim());
|
||||
ASSERT_EQ(bdims[idx].level(), expected_bdims[idx].level());
|
||||
}
|
||||
@ -394,7 +396,7 @@ TEST(VmapTest, TestBatchedTensorSum) {
|
||||
static void checkBroadcastingVmapTransform(TensorList inputs, TensorList expected_outputs) {
|
||||
auto outputs = BroadcastingVmapTransform::logicalToPhysical(inputs);
|
||||
ASSERT_EQ(outputs.size(), expected_outputs.size());
|
||||
for (int64_t idx = 0; idx < outputs.size(); idx++) {
|
||||
for (const auto idx : c10::irange(outputs.size())) {
|
||||
const auto& output = outputs[idx].tensor();
|
||||
ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr());
|
||||
ASSERT_TRUE(at::allclose(output, expected_outputs[idx]));
|
||||
@ -878,7 +880,7 @@ TEST(VmapTest, TestBatchedTensorPermute) {
|
||||
static void checkMultiBatchVmapTransform(TensorList inputs, TensorList expected_outputs) {
|
||||
auto outputs = MultiBatchVmapTransform::logicalToPhysical(inputs);
|
||||
ASSERT_EQ(outputs.size(), expected_outputs.size());
|
||||
for (int64_t idx = 0; idx < outputs.size(); idx++) {
|
||||
for (const auto idx : c10::irange(outputs.size())) {
|
||||
const auto& output = outputs[idx].tensor();
|
||||
ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr());
|
||||
ASSERT_EQ(output.sizes(), expected_outputs[idx].sizes());
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <ATen/vulkan/Context.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs) {
|
||||
double maxValue = 0.0;
|
||||
@ -145,7 +146,7 @@ TEST(VulkanTest, addScalar) {
|
||||
auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -772,7 +773,7 @@ TEST(VulkanTest, tensor5d_transpose) {
|
||||
at::empty({1, 2, 3, 2, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -816,7 +817,7 @@ TEST(VulkanTest, slice) {
|
||||
at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -841,7 +842,7 @@ TEST(VulkanTest, select) {
|
||||
at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
@ -866,7 +867,7 @@ TEST(VulkanTest, unsqueeze) {
|
||||
at::empty({1, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
|
||||
float* data = t_in.data_ptr<float>();
|
||||
auto numel = t_in.numel();
|
||||
for (int i = 0; i < numel; i++) {
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
data[i] = i;
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
#include <torch/csrc/jit/tensorexpr/loopnest.h>
|
||||
@ -15,14 +16,14 @@ class ConcatBench : public benchmark::Fixture {
|
||||
input_sizes_ = std::move(input_sizes);
|
||||
concat_dim_ = concat_dim;
|
||||
inputs_.resize(input_sizes_.size());
|
||||
for (size_t i = 0; i < input_sizes_.size(); ++i) {
|
||||
for (const auto i : c10::irange(input_sizes_.size())) {
|
||||
inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]});
|
||||
}
|
||||
output_size_.resize(input_sizes_.front().size());
|
||||
for (size_t i = 0; i < output_size_.size(); ++i) {
|
||||
for (const auto i : c10::irange(output_size_.size())) {
|
||||
if (i == static_cast<size_t>(concat_dim_)) {
|
||||
output_size_[i] = 0;
|
||||
for (size_t j = 0; j < input_sizes_.size(); ++j) {
|
||||
for (const auto j : c10::irange(input_sizes_.size())) {
|
||||
output_size_[i] += input_sizes_[j][i];
|
||||
}
|
||||
} else {
|
||||
@ -64,7 +65,7 @@ class ConcatBench : public benchmark::Fixture {
|
||||
[&](const VarHandle& m, const VarHandle& n) {
|
||||
int d = 0;
|
||||
std::vector<int> cumulative_concat_dim_sizes(num_inputs);
|
||||
for (size_t i = 0; i < num_inputs; ++i) {
|
||||
for (const auto i : c10::irange(num_inputs)) {
|
||||
cumulative_concat_dim_sizes[i] = d;
|
||||
d += input_sizes_[i][concat_dim_];
|
||||
}
|
||||
@ -119,7 +120,7 @@ class ConcatBench : public benchmark::Fixture {
|
||||
{input_sizes_[i][0], input_sizes_[i][1]},
|
||||
kFloat));
|
||||
std::vector<VarPtr> for_vars(num_inputs);
|
||||
for (size_t d = 0; d < num_dims; ++d) {
|
||||
for (const auto d : c10::irange(num_dims)) {
|
||||
for_vars[d] =
|
||||
alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/core/InferenceMode.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/codegen/fuser/interface.h>
|
||||
#include <torch/torch.h>
|
||||
|
||||
@ -22,7 +23,8 @@ static void FusedOverhead(benchmark::State& state) {
|
||||
auto z = torch::ones({1});
|
||||
|
||||
// Warmup.
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
m.run_method("two_adds", x, y, z);
|
||||
}
|
||||
|
||||
@ -43,7 +45,8 @@ static void UnfusedOverhead(benchmark::State& state) {
|
||||
auto z = torch::ones({1});
|
||||
|
||||
// Warmup.
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (const auto i : c10::irange(8)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
m.run_method("two_adds", x, y, z);
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
@ -53,7 +54,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
|
||||
float* c_ptr = C.data_ptr<float>();
|
||||
std::vector<void*> args({c_ptr, a_ptr, b_ptr});
|
||||
cg.value<int>(args);
|
||||
for (int i = 0; i < M; i++) {
|
||||
for (const auto i : c10::irange(M)) {
|
||||
float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]);
|
||||
TORCH_CHECK(diff < 1e-5);
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
|
||||
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
|
||||
@ -78,7 +79,7 @@ static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
|
||||
int size = A.numel();
|
||||
TORCH_CHECK(B.numel() == 1);
|
||||
*pB = 0.;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
*pB += pA[i];
|
||||
}
|
||||
}
|
||||
@ -101,18 +102,18 @@ static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
|
||||
TORCH_CHECK(size % kChunkSize == 0);
|
||||
*pB = 0.;
|
||||
float temp[kChunkSize];
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
temp[j] = 0;
|
||||
}
|
||||
|
||||
int chunk_count = size / kChunkSize;
|
||||
for (int i = 0; i < chunk_count; i++) {
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto i : c10::irange(chunk_count)) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
temp[j] += pA[i * kChunkSize + j];
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < kChunkSize; j++) {
|
||||
for (const auto j : c10::irange(kChunkSize)) {
|
||||
*pB += temp[j];
|
||||
}
|
||||
}
|
||||
@ -163,7 +164,7 @@ static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
|
||||
temp = _mm256_setzero_ps();
|
||||
|
||||
int tile_count = size / kChunkSize;
|
||||
for (int i = 0; i < tile_count; i++) {
|
||||
for (const auto i : c10::irange(tile_count)) {
|
||||
__m256 data = _mm256_load_ps(pA + i * kChunkSize);
|
||||
temp = _mm256_add_ps(temp, data);
|
||||
}
|
||||
@ -196,7 +197,7 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
|
||||
kChunkSize,
|
||||
" ! = 0");
|
||||
__m256 t[kTileSize];
|
||||
for (int j = 0; j < kTileSize; j++) {
|
||||
for (const auto j : c10::irange(kTileSize)) {
|
||||
t[j] = _mm256_setzero_ps();
|
||||
}
|
||||
|
||||
@ -211,7 +212,7 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
|
||||
}
|
||||
|
||||
float result = sum_f32x8(t[0]);
|
||||
for (int j = 1; j < kTileSize; j++) {
|
||||
for (const auto j : c10::irange(1, kTileSize)) {
|
||||
result += sum_f32x8(t[j]);
|
||||
}
|
||||
*pB = result;
|
||||
@ -540,16 +541,16 @@ BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) {
for (int m_outer = 0; m_outer < M; m_outer += Mb) {
float bregs[Mb][Nb] = {0.0f};
for (int n_outer = 0; n_outer < N; n_outer += Nb) {
for (int m_inner = 0; m_inner < Mb; m_inner++) {
for (int n_inner = 0; n_inner < Nb; n_inner++) {
for (const auto m_inner : c10::irange(Mb)) {
for (const auto n_inner : c10::irange(Nb)) {
bregs[m_inner][n_inner] +=
a[(m_outer + m_inner) * N + n_outer + n_inner];
}
}
}
for (int m_inner = 0; m_inner < Mb; m_inner++) {
for (const auto m_inner : c10::irange(Mb)) {
b[m_outer + m_inner] = 0.f;
for (int n_inner = 0; n_inner < Nb; n_inner++) {
for (const auto n_inner : c10::irange(Nb)) {
b[m_outer + m_inner] += bregs[m_inner][n_inner];
}
}

@ -24,6 +24,7 @@
#include "caffe2/core/operator.h"
#include "caffe2/utils/string_utils.h"
#include "c10/util/string_utils.h"
#include <c10/util/irange.h>

using std::map;
using std::shared_ptr;
@ -55,12 +56,12 @@ void writeTextOutput(
int dims_size = tensor_proto.dims_size();
long long elem_dim_size =
dims_size > 1 ? tensor_proto.dims(1) : tensor_proto.dims(0);
for (int i = 2; i < dims_size; i++) {
for (const auto i : c10::irange(2, dims_size)) {
elem_dim_size *= tensor_proto.dims(i);
}
std::vector<std::string> lines;
std::string dims;
for (int i = 0; i < dims_size; i++) {
for (const auto i : c10::irange(dims_size)) {
int dim = tensor_proto.dims(i);
if (i > 0) {
dims += ", ";

@ -2,6 +2,7 @@
#include <c10/core/DeviceType.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

// TODO: rename flags to C10
C10_DEFINE_bool(
@ -30,7 +31,7 @@ void memset_junk(void* data, size_t num) {
int32_t int64_count = num / sizeof(kJunkPattern64);
int32_t remaining_bytes = num % sizeof(kJunkPattern64);
int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
for (int i = 0; i < int64_count; i++) {
for (const auto i : c10::irange(int64_count)) {
data_i64[i] = kJunkPattern64;
}
if (remaining_bytes > 0) {

@ -5,6 +5,7 @@
#include <c10/core/WrapDimMinimal.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/util/Optional.h>
#include <c10/util/irange.h>

C10_DEFINE_bool(
caffe2_keep_on_shrink,
@ -335,7 +336,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const {
}
SmallVector<int64_t, 5> perm;
perm.resize(dim());
for (int64_t i = 0; i < dim(); i++) {
for (const auto i : c10::irange(dim())) {
perm[i] = i;
}
// Sort by strides, leaving 0 and 1 sized dims at the end of the array
@ -349,7 +350,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const {
sizes_and_strides_.stride_at_unchecked(b);
});
auto require_stride = 1;
for (int64_t i = 0; i < dim(); i++) {
for (const auto i : c10::irange(dim())) {
const auto size_perm_i = sizes_and_strides_.size_at_unchecked(perm[i]);
if (size_perm_i < 2) {
return true;

@ -19,6 +19,7 @@
#include <c10/util/Logging.h>
#include <c10/util/Optional.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <c10/util/python_stub.h>

// A global boolean variable to control whether we free memory when a Tensor
@ -68,7 +69,7 @@ inline std::vector<int64_t> ToVectorint64_t(ArrayRef<int> src) {
*/
inline int64_t size_from_dim_(int k, IntArrayRef dims) {
int64_t r = 1;
for (size_t i = k; i < dims.size(); ++i) {
for (const auto i : c10::irange(k, dims.size())) {
r *= dims[i];
}
return r;
@ -78,7 +79,7 @@ inline int64_t size_from_dim_(int k, IntArrayRef dims) {
inline int64_t size_to_dim_(int k, IntArrayRef dims) {
TORCH_CHECK((unsigned)k <= dims.size());
int64_t r = 1;
for (int i = 0; i < k; ++i) {
for (const auto i : c10::irange(k)) {
r *= dims[i];
}
return r;
@ -2163,7 +2164,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
auto old_numel = numel_;
sizes_and_strides_.resize(src.size());
int64_t new_numel = 1;
for (size_t i = 0; i < src.size(); ++i) {
for (const auto i : c10::irange(src.size())) {
new_numel *= src[i];
sizes_and_strides_.size_at_unchecked(i) = src[i];
}

@ -2,6 +2,7 @@

#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>

namespace c10 {
namespace impl {
@ -237,7 +238,7 @@ class InlineMultiStreamGuard {
static DeviceType getDeviceTypeOfStreams(ArrayRef<Stream> streams) {
TORCH_INTERNAL_ASSERT(!streams.empty());
DeviceType type = streams[0].device_type();
for (size_t idx = 1; idx < streams.size(); idx++) {
for (const auto idx : c10::irange(1, streams.size())) {
TORCH_CHECK_VALUE(
streams[idx].device_type() == type,
"Streams have a mix of device types: stream 0 is on ",

@ -201,7 +201,7 @@ static SizesAndStrides makeBig(int offset = 0) {

static void checkSmall(const SizesAndStrides& sm, int offset = 0) {
std::vector<int64_t> sizes(3), strides(3);
for (int ii = 0; ii < 3; ++ii) {
for (const auto ii : c10::irange(3)) {
sizes[ii] = ii + 1 + offset;
strides[ii] = 2 * (ii + 1 + offset);
}
@ -210,7 +210,7 @@ static void checkSmall(const SizesAndStrides& sm, int offset = 0) {

static void checkBig(const SizesAndStrides& big, int offset = 0) {
std::vector<int64_t> sizes(8), strides(8);
for (int ii = 0; ii < 8; ++ii) {
for (const auto ii : c10::irange(8)) {
sizes[ii] = ii - 1 + offset;
strides[ii] = 2 * (ii - 1 + offset);
}

@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <c10/util/Bitset.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
using c10::utils::bitset;
|
||||
|
||||
@ -37,7 +38,7 @@ TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenIsSet) {
|
||||
TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenOthersStayUnset) {
|
||||
bitset b;
|
||||
b.set(6);
|
||||
for (size_t i = 0; i < 6; ++i) {
|
||||
for (const auto i : c10::irange(6)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 7; i < bitset::NUM_BITS(); ++i) {
|
||||
@ -56,10 +57,10 @@ TEST(BitsetTest, givenNonemptyBitset_whenSettingBit_thenOthersStayAtOldValue) {
|
||||
bitset b;
|
||||
b.set(6);
|
||||
b.set(30);
|
||||
for (size_t i = 0; i < 6; ++i) {
|
||||
for (const auto i : c10::irange(6)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 7; i < 30; ++i) {
|
||||
for (const auto i : c10::irange(7, 30)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
for (size_t i = 31; i < bitset::NUM_BITS(); ++i) {
|
||||
@ -82,7 +83,7 @@ TEST(
|
||||
b.set(6);
|
||||
b.set(30);
|
||||
b.unset(6);
|
||||
for (size_t i = 0; i < 30; ++i) {
|
||||
for (const auto i : c10::irange(30)) {
|
||||
EXPECT_FALSE(b.get(i));
|
||||
}
|
||||
EXPECT_TRUE(b.get(30));
|
||||
@ -100,7 +101,7 @@ struct IndexCallbackMock final {
|
||||
|
||||
void expect_was_called_for_indices(std::vector<size_t> expected_indices) {
|
||||
EXPECT_EQ(expected_indices.size(), called_for_indices.size());
|
||||
for (size_t i = 0; i < expected_indices.size(); ++i) {
|
||||
for (const auto i : c10::irange(expected_indices.size())) {
|
||||
EXPECT_EQ(expected_indices[i], called_for_indices[i]);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
// clang-format off
|
||||
#include <c10/util/BFloat16.h>
|
||||
#include <c10/util/BFloat16-math.h>
|
||||
#include <c10/util/irange.h>
|
||||
// clang-format on
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@ -24,7 +25,7 @@ float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) {
|
||||
TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float in[100];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
|
||||
in[i] = i + 1.25;
|
||||
}
|
||||
@ -34,7 +35,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float out[100];
|
||||
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
bfloats[i].x = c10::detail::bits_from_f32(in[i]);
|
||||
out[i] = c10::detail::f32_from_bits(bfloats[i].x);
|
||||
|
||||
@ -47,7 +48,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) {
|
||||
TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float in[100];
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
|
||||
in[i] = i + 1.25;
|
||||
}
|
||||
@ -57,7 +58,7 @@ TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays)
|
||||
float out[100];
|
||||
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
bfloats[i].x = c10::detail::round_to_nearest_even(in[i]);
|
||||
out[i] = c10::detail::f32_from_bits(bfloats[i].x);
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/order_preserving_flat_hash_map.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
@ -15,14 +16,15 @@ using dict_int_int =
|
||||
ska_ordered::order_preserving_flat_hash_map<int64_t, int64_t>;
|
||||
|
||||
dict_int_int test_dict(dict_int_int& dict) {
|
||||
for (int64_t i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
dict[i] = i + 1;
|
||||
}
|
||||
|
||||
int64_t i = 0;
|
||||
int64_t entry_i = 0;
|
||||
for (auto entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(entry.first == i && entry.second == i + 1);
|
||||
++i;
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
entry.first == entry_i && entry.second == entry_i + 1);
|
||||
++entry_i;
|
||||
}
|
||||
|
||||
// erase a few entries by themselves
|
||||
@ -33,29 +35,32 @@ dict_int_int test_dict(dict_int_int& dict) {
|
||||
|
||||
// erase via iterators
|
||||
auto begin = dict.begin();
|
||||
for (size_t i = 0; i < 20; ++i)
|
||||
for (const auto i : c10::irange(20)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
begin++;
|
||||
}
|
||||
|
||||
auto end = begin;
|
||||
for (size_t i = 0; i < 20; ++i) {
|
||||
for (const auto i : c10::irange(20)) {
|
||||
(void)i; // Suppress unused variable warning
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
dict.erase(begin, end);
|
||||
|
||||
std::vector<size_t> order;
|
||||
for (size_t i = 0; i < 100; ++i) {
|
||||
for (const auto i : c10::irange(100)) {
|
||||
if (!erase_set.count(i)) {
|
||||
order.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
i = 0;
|
||||
entry_i = 0;
|
||||
for (auto entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(order[i] == entry.first);
|
||||
TORCH_INTERNAL_ASSERT(dict[order[i]] == entry.second);
|
||||
TORCH_INTERNAL_ASSERT(entry.second == order[i] + 1);
|
||||
i++;
|
||||
TORCH_INTERNAL_ASSERT(order[entry_i] == entry.first);
|
||||
TORCH_INTERNAL_ASSERT(dict[order[entry_i]] == entry.second);
|
||||
TORCH_INTERNAL_ASSERT(entry.second == order[entry_i] + 1);
|
||||
entry_i++;
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(dict.size() == order.size());
|
||||
return dict;
|
||||
@ -113,12 +118,12 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
for (auto init_dict_size : {27, 34, 41}) {
|
||||
bad_hash_dict dict;
|
||||
for (int64_t i = 0; i < init_dict_size; ++i) {
|
||||
for (const auto i : c10::irange(init_dict_size)) {
|
||||
dict[i] = i + 1;
|
||||
}
|
||||
|
||||
int64_t i = 0;
|
||||
for (auto entry : dict) {
|
||||
for (const auto& entry : dict) {
|
||||
TORCH_INTERNAL_ASSERT(entry.first == i && entry.second == i + 1);
|
||||
++i;
|
||||
}
|
||||
@ -131,20 +136,22 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
// erase a few entries via iterator
|
||||
auto begin = dict.begin();
|
||||
for (size_t i = 0; i < 10; ++i) {
|
||||
for (const auto j : c10::irange(10)) {
|
||||
(void)j; // Suppress unused variable warning
|
||||
begin++;
|
||||
}
|
||||
auto end = begin;
|
||||
for (size_t i = 0; i < 7; ++i) {
|
||||
for (const auto j : c10::irange(7)) {
|
||||
(void)j; // Suppress unused variable warning
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
dict.erase(begin, end);
|
||||
|
||||
std::vector<int64_t> order;
|
||||
for (int64_t i = 0; i < init_dict_size; ++i) {
|
||||
if (!erase_set.count(i)) {
|
||||
order.push_back(i);
|
||||
for (const auto j : c10::irange(init_dict_size)) {
|
||||
if (!erase_set.count(j)) {
|
||||
order.push_back(j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -167,7 +174,7 @@ TEST(OrderedPreservingDictTest, test_range_insert) {
|
||||
// check values
|
||||
const int nb_values = 1000;
|
||||
std::vector<std::pair<int, int>> values;
|
||||
for (int i = 0; i < nb_values; i++) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
// NOLINTNEXTLINE(modernize-use-emplace,performance-inefficient-vector-operation)
|
||||
values.push_back(std::make_pair(i, i + 1));
|
||||
}
|
||||
@ -190,7 +197,7 @@ TEST(OrderedPreservingDictTest, test_range_erase_all) {
|
||||
// insert x values, delete all
|
||||
const std::size_t nb_values = 1000;
|
||||
dict_int_int map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[i] = i + 1;
|
||||
}
|
||||
auto it = map.erase(map.begin(), map.end());
|
||||
@ -206,7 +213,7 @@ TEST(OrderedPreservingDictTest, test_range_erase) {
|
||||
|
||||
const std::size_t nb_values = 1000;
|
||||
HMap map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[c10::guts::to_string(i)] = i;
|
||||
auto begin = map.begin();
|
||||
for (size_t j = 0; j <= i; ++j, begin++) {
|
||||
@ -305,7 +312,7 @@ TEST(OrderedPreservingDictTest, test_copy_constructor_and_operator) {
|
||||
|
||||
const std::size_t nb_values = 100;
|
||||
HMap map;
|
||||
for (size_t i = 0; i < nb_values; ++i) {
|
||||
for (const auto i : c10::irange(nb_values)) {
|
||||
map[c10::guts::to_string(i)] = c10::guts::to_string(i);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
#include <c10/util/Backtrace.h>
#include <c10/util/Optional.h>
#include <c10/util/Type.h>
#include <c10/util/irange.h>

#include <functional>
#include <memory>
@ -281,8 +282,7 @@ std::string get_backtrace(
// Toggles to true after the first skipped python frame.
bool has_skipped_python_frames = false;

for (size_t frame_number = 0; frame_number < callstack.size();
++frame_number) {
for (const auto frame_number : c10::irange(callstack.size())) {
const auto frame = parse_frame_information(symbols[frame_number]);

if (skip_python_frames && frame && is_python_frame(*frame)) {

@ -27,6 +27,7 @@
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
/*
|
||||
* TypeIdentifier is a small type containing an id.
|
||||
@ -170,7 +171,7 @@ struct TypeMetaData final {
|
||||
template <typename T>
|
||||
inline void _PlacementNew(void* ptr, size_t n) {
|
||||
T* typed_ptr = static_cast<T*>(ptr);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
new (typed_ptr + i) T;
|
||||
}
|
||||
}
|
||||
@ -234,7 +235,7 @@ template <typename T>
|
||||
inline void _Copy(const void* src, void* dst, size_t n) {
|
||||
const T* typed_src = static_cast<const T*>(src);
|
||||
T* typed_dst = static_cast<T*>(dst);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
typed_dst[i] = typed_src[i];
|
||||
}
|
||||
}
|
||||
@ -274,7 +275,7 @@ inline constexpr TypeMetaData::Copy* _PickCopy() {
|
||||
template <typename T>
|
||||
inline void _PlacementDelete(void* ptr, size_t n) {
|
||||
T* typed_ptr = static_cast<T*>(ptr);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
typed_ptr[i].~T();
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <string>
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <caffe2/core/context.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/utils/math.h>
|
||||
@ -130,7 +131,7 @@ private:
|
||||
void assignListStartingAt(
|
||||
size_t offset,
|
||||
const std::vector<at::Tensor>& tensors) {
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
assignTo(Output(offset + i), tensors[i]);
|
||||
}
|
||||
}
|
||||
@ -176,7 +177,7 @@ private:
|
||||
std::stringstream descriptor;
|
||||
descriptor << op;
|
||||
std::vector<std::string> attrs;
|
||||
for(size_t i = 0; i < operator_def.arg_size(); i++) {
|
||||
for (const auto i : c10::irange(operator_def.arg_size())) {
|
||||
auto & attr = operator_def.arg(i);
|
||||
if(attr.name() == "operator" || attr.name() == "type" )
|
||||
continue;
|
||||
@ -223,7 +224,7 @@ private:
|
||||
std::vector<int64_t> ints =
|
||||
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
std::array<bool, N> result;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
result[i] = ints.at(i);
|
||||
}
|
||||
return result;
|
||||
|
@ -118,8 +118,8 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
if (!W_fbgemm->packed()) {
|
||||
float* W_fp16_trans = new float[W_size];
|
||||
fbgemm::Float16ToFloat_avx2(W_fbgemm->pmat(), W_fp16_trans, W_size);
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < K; j++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(K)) {
|
||||
W_fp16_[j * N + i] = W_fp16_trans[i * K + j];
|
||||
}
|
||||
}
|
||||
@ -136,8 +136,8 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
const auto& W = Input(1);
|
||||
W_data = W.template data<T_W>();
|
||||
// Transpose W
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < K; j++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(K)) {
|
||||
W_fp16_[j * N + i] = W_data[i * K + j];
|
||||
}
|
||||
}
|
||||
@ -352,7 +352,7 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
|
||||
float compute_L2_norm(float* A, int size) {
|
||||
float square_sum = 0.0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
square_sum += A[i] * A[i];
|
||||
}
|
||||
return std::sqrt(square_sum);
|
||||
@ -360,7 +360,7 @@ class Fp16FCAccOp final : public Operator<Context> {
|
||||
|
||||
float compute_relative_error(float* A, float* A_ref, int size) {
|
||||
float error = 0.0;
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
error += (A[i] - A_ref[i]) * (A[i] - A_ref[i]);
|
||||
}
|
||||
error = std::sqrt(error);
|
||||
|
@ -22,7 +22,7 @@ void Int8DequantizeNNPI(
|
||||
const float X_scale,
|
||||
const int32_t X_offset) {
|
||||
float X_scale_fp32 = 1.0f / X_scale;
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
out[i] = (float)(static_cast<int32_t>(in[i]) - X_offset) / X_scale_fp32;
|
||||
}
|
||||
} // namespace
|
||||
|
@ -53,12 +53,12 @@ void Int8QuantizeNNPI(
|
||||
std::vector<float> inv_scalev(N, inv_scale_fp16);
|
||||
std::vector<float> offsetv(N, -offset_tmp);
|
||||
fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data());
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
offsetv[i] = round(offsetv[i]);
|
||||
}
|
||||
fbgemm::RoundToFloat16(
|
||||
offsetv.data(), offsetv.data(), N, false /* no clamping */);
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
float halfRes = offsetv[i];
|
||||
if (std::isinf(halfRes)) {
|
||||
if (halfRes > 0) {
|
||||
|
@ -29,7 +29,7 @@ void SwishFakeInt8NNPI(
|
||||
int32_t quant_val = 0;
|
||||
uint8_t result = 0;
|
||||
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
deq_val = (static_cast<uint8_t>(in[i]) - X_offset) / X_scale_fp32;
|
||||
deq_swish = deq_val / (1 + exp(-deq_val));
|
||||
quant_val = round(deq_swish / Y_scale + Y_offset);
|
||||
|
@ -129,7 +129,7 @@ class LayerNormFakeFp16Op final : public Operator<CPUContext> {
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
|
||||
false /*USE_ACC_FP16*/);
|
||||
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (const auto i : c10::irange(M)) {
|
||||
// fma_fp16(A, B, Out) -> Out = A * B + Out
|
||||
std::vector<float> out(N);
|
||||
std::memcpy(out.data(), bias_data.data(), sizeof(float) * N);
|
||||
@ -169,7 +169,7 @@ class LayerNormFakeFp16Op final : public Operator<CPUContext> {
|
||||
const int32_t qmin = std::numeric_limits<uint8_t>::min();
|
||||
const int32_t qmax = std::numeric_limits<uint8_t>::max();
|
||||
|
||||
for (int i = 0; i < Nout; i++) {
|
||||
for (const auto i : c10::irange(Nout)) {
|
||||
float halfRes = offsetv[i];
|
||||
halfRes = round(halfRes);
|
||||
if (std::isinf(halfRes)) {
|
||||
|
@ -85,7 +85,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
const auto scale_bias_offset = 2 * sizeof(at::Half);
|
||||
const int64_t input_fused_block_size = input_block_size + scale_bias_offset;
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
if (!use_fp16_for_embedding_only) {
|
||||
memset(rowTempSums[0].data(), 0, sizeof(float) * output_block_size);
|
||||
memset(rowTempSums[1].data(), 0, sizeof(float) * output_block_size);
|
||||
@ -135,7 +135,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
// Unpack int4 elements
|
||||
std::vector<float> input_rounded(output_block_size);
|
||||
int k = 0;
|
||||
for (int j = 0; j < input_block_size; j++) {
|
||||
for (const auto j : c10::irange(input_block_size)) {
|
||||
input_rounded[k++] =
|
||||
input[input_fused_block_size * indices_data[current] + j] & 0x0f;
|
||||
input_rounded[k++] =
|
||||
@ -150,7 +150,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
input_rounded.data(),
|
||||
product_rounded.data());
|
||||
|
||||
for (int j = 0; j < output_block_size; ++j) {
|
||||
for (const auto j : c10::irange(output_block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
|
||||
@ -190,7 +190,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
}
|
||||
|
||||
if (!use_fp16_for_embedding_only) {
|
||||
for (int j = 0; j < output_block_size; ++j) {
|
||||
for (const auto j : c10::irange(output_block_size)) {
|
||||
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
|
||||
}
|
||||
fbgemm::RoundToFloat16(
|
||||
|
@ -84,7 +84,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
const auto scale_bias_offset = 8 / sizeof(uint8_t);
|
||||
const int64_t fused_block_size = block_size + scale_bias_offset;
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
memset(out, 0, sizeof(float) * block_size);
|
||||
memset(rowTempSums[0].data(), 0, sizeof(float) * block_size);
|
||||
memset(rowTempSums[1].data(), 0, sizeof(float) * block_size);
|
||||
@ -152,7 +152,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
|
||||
// Fake fp16 rounding of input/ it is already ints
|
||||
std::vector<float> input_rounded(block_size);
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
input_rounded[j] =
|
||||
input[fused_block_size * indices_data[current] + j];
|
||||
}
|
||||
@ -164,7 +164,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
TypedAxpy<float, float>(
|
||||
block_size, scale, input_rounded.data(), product_rounded.data());
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
|
||||
@ -215,7 +215,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
block_size,
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
product_rounded[j] += bias;
|
||||
}
|
||||
// Fake fp16 rounding of w x scale x input + w x bias
|
||||
@ -239,7 +239,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
block_size,
|
||||
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
|
||||
} else if (use_acc_fp32) {
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
float deqVal = fake_fp16::fmafp32_avx_emulation(
|
||||
scale,
|
||||
input_rounded[j],
|
||||
@ -256,7 +256,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
|
||||
TypedAxpy<float, float>(block_size, scale, input_rounded.data(), out);
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
out[j] += bias;
|
||||
}
|
||||
}
|
||||
@ -264,7 +264,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
|
||||
}
|
||||
|
||||
if (use_nnpi_fma || use_acc_fp32) {
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
for (const auto j : c10::irange(block_size)) {
|
||||
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
|
||||
}
|
||||
}
|
||||
|
@ -94,7 +94,7 @@ class SparseLengthsReductionFakeFp16Op final : public Operator<CPUContext> {
|
||||
float* out = out_data;
|
||||
|
||||
int64_t current = 0;
|
||||
for (int m = 0; m < output_size; ++m) {
|
||||
for (const auto m : c10::irange(output_size)) {
|
||||
memset(out, 0, sizeof(float) * block_size);
|
||||
if (current + lengths[m] > index_size) {
|
||||
return false;
|
||||
|
@ -39,7 +39,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
|
||||
Y_scale = 1.0f / Y_scale;
|
||||
|
||||
// create table once
|
||||
for (int i = 0; i < lutSize; i++) {
|
||||
for (const auto i : c10::irange(lutSize)) {
|
||||
short input = i + tanhLUTMinOffset;
|
||||
float x = _cvtsh_ss(input);
|
||||
float tanh_x = tanh(x);
|
||||
@ -54,7 +54,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
|
||||
}
|
||||
|
||||
const float* X_data = X.template data<float>();
|
||||
for (int i = 0; i < X.numel(); i++) {
|
||||
for (const auto i : c10::irange(X.numel())) {
|
||||
short val = _cvtss_sh(X_data[i], 0);
|
||||
unsigned short max16BitPositive = 0x7FFF;
|
||||
unsigned short input16Bit = (*(unsigned short*)& val);
|
||||
|
@ -159,7 +159,7 @@ class SpatialBNFakeLoweredFp16Op : public Operator<CPUContext> {
|
||||
const int stride = C * HxW;
|
||||
const float* X_ptr = X;
|
||||
float* Y_ptr = Y;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
EigenArrayMap<float>(Y_ptr, HxW, C) =
|
||||
ConstEigenArrayMap<float>(X_ptr, HxW, C).rowwise() -
|
||||
mean_arr.transpose();
|
||||
@ -356,9 +356,9 @@ class SpatialBNFakeFp16Op : public Operator<CPUContext> {
|
||||
float* Y_ptr = Y;
|
||||
|
||||
// Do Y = X * scale + bias
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < C; j++) {
|
||||
for (int k = 0; k < HxW; k++) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
for (const auto j : c10::irange(C)) {
|
||||
for (const auto k : c10::irange(HxW)) {
|
||||
Y_ptr[HxW * j + k] = bias[j];
|
||||
}
|
||||
|
||||
|
@ -18,7 +18,7 @@ class SumFP16FP16AccOp : public Operator<Context> {
|
||||
size_t N = input0.numel();
|
||||
auto* output = Output(0, input0.sizes(), at::dtype<float>());
|
||||
// Dimension checking
|
||||
for (int i = 1; i < InputSize(); ++i) {
|
||||
for (const auto i : c10::irange(1, InputSize())) {
|
||||
if (output->sizes() != Input(i).sizes()) {
|
||||
CAFFE_THROW(
|
||||
"Check failed: output->sizes() == Input(i).sizes().",
|
||||
@ -37,7 +37,7 @@ class SumFP16FP16AccOp : public Operator<Context> {
|
||||
std::vector<float> t1(N);
|
||||
std::vector<float> t2(N);
|
||||
|
||||
for (auto i = 0; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(InputSize())) {
|
||||
fbgemm::RoundToFloat16(
|
||||
Input(i).template data<float>(),
|
||||
t1.data(),
|
||||
|
@ -85,13 +85,13 @@ class AllgatherOp final : public Operator<Context> {
|
||||
|
||||
// Verify tensors all have same size
|
||||
size_t size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same type
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -113,7 +113,7 @@ class AllgatherOp final : public Operator<Context> {
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.size = Input(1).numel();
|
||||
params.meta = Input(1).dtype();
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
}
|
||||
params.outputs.resize(OutputSize());
|
||||
|
@ -65,19 +65,19 @@ class AllreduceOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0U; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(0U, init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
auto size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same type
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ class AllreduceOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.outputs.resize(OutputSize());
|
||||
for (auto i = 0U; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(0U, params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -60,19 +60,19 @@ class BroadcastOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
size_t size = Input(1).numel();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
|
||||
}
|
||||
|
||||
// Verify tensors all have same size
|
||||
TypeMeta meta = Input(1).dtype();
|
||||
for (auto i = 2; i < InputSize(); i++) {
|
||||
for (const auto i : c10::irange(2, InputSize())) {
|
||||
CAFFE_ENFORCE(Input(i).dtype() == meta);
|
||||
}
|
||||
|
||||
@ -94,7 +94,7 @@ class BroadcastOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 1);
|
||||
params.outputs.resize(OutputSize());
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -75,7 +75,7 @@ class ReduceScatterOp final : public Operator<Context> {
|
||||
|
||||
// Verify inputs == outputs
|
||||
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
|
||||
for (auto i = 0; i < init_.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(init_.inputs.size())) {
|
||||
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
|
||||
}
|
||||
|
||||
@ -107,7 +107,7 @@ class ReduceScatterOp final : public Operator<Context> {
|
||||
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
|
||||
params.inputs.resize(InputSize() - 2);
|
||||
params.outputs.resize(OutputSize() - 1);
|
||||
for (auto i = 0; i < params.inputs.size(); i++) {
|
||||
for (const auto i : c10::irange(params.inputs.size())) {
|
||||
params.inputs[i] = Input(i + 1).raw_data();
|
||||
params.outputs[i] = Output(i)->raw_mutable_data();
|
||||
}
|
||||
|
@ -1241,7 +1241,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
|
||||
return err;
|
||||
}
|
||||
|
||||
for(int i = 0; i < N; ++i) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
(*param)[i] = value[i];
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/blob_serializer_base.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/typeid.h>
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/simple_queue.h"
|
||||
@ -201,7 +203,7 @@ void ExtendRepeatedField(
|
||||
#else
|
||||
// We unfortunately do still need to support old protobuf versions in some
|
||||
// build configurations.
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
field->Add(0);
|
||||
}
|
||||
#endif
|
||||
@ -236,7 +238,7 @@ inline void CopyToProtoWithCast(
|
||||
context->template CopyToCPU<SrcType>(size, src, buffer.get());
|
||||
context->FinishDeviceComputation();
|
||||
field->Reserve(size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
field->Add(static_cast<DstType>(buffer[i]));
|
||||
}
|
||||
}
|
||||
@ -267,7 +269,7 @@ inline void CopyFromProtoWithCast(
|
||||
// CPUContext. Remove it if it is performance critical.
|
||||
unique_ptr<DstType[]> buffer(new DstType[size]);
|
||||
const SrcType* src = field.data();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
buffer[i] = static_cast<DstType>(src[i]);
|
||||
}
|
||||
context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
|
||||
#include <c10/core/GeneratorImpl.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/core/DistributionsHelper.h>
|
||||
#include <ATen/core/MT19937RNGEngine.h>
|
||||
#else
|
||||
@ -155,7 +156,7 @@ class TORCH_API CPUContext final : public BaseContext {
|
||||
static_cast<const void*>(src),
|
||||
static_cast<void*>(dst));
|
||||
} else {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <mutex>
|
||||
|
||||
#include <c10/util/Registry.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/string_view.h>
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
@ -248,7 +249,8 @@ class TORCH_API DBReader {
|
||||
*value = cursor_->value();
|
||||
|
||||
// In sharded mode, each read skips num_shards_ records
|
||||
for (uint32_t s = 0; s < num_shards_; s++) {
|
||||
for (const auto s : c10::irange(num_shards_)) {
|
||||
(void)s; // Suppress unused variable
|
||||
cursor_->Next();
|
||||
if (!cursor_->Valid()) {
|
||||
MoveToBeginning();
|
||||
@ -292,7 +294,8 @@ class TORCH_API DBReader {
|
||||
|
||||
void MoveToBeginning() const {
|
||||
cursor_->SeekToFirst();
|
||||
for (uint32_t s = 0; s < shard_id_; s++) {
|
||||
for (const auto s : c10::irange(shard_id_)) {
|
||||
(void)s; // Suppress unused variable
|
||||
cursor_->Next();
|
||||
CAFFE_ENFORCE(
|
||||
cursor_->Valid(), "Db has fewer rows than shard id: ", s, shard_id_);
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <c10/util/C++17.h>
|
||||
#include <c10/util/Metaprogramming.h>
|
||||
#include "caffe2/core/export_caffe2_op_to_c10.h"
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -136,7 +137,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
|
||||
void popOutputs_() {
|
||||
AT_ASSERT(stack_.size() == op_.schema().returns().size());
|
||||
for (size_t i = 0; i < op_.schema().returns().size(); ++i) {
|
||||
for (const auto i : c10::irange(op_.schema().returns().size())) {
|
||||
OperatorBase::SetOutputTensor(i, Tensor(std::move(stack_[i]).toTensor()));
|
||||
}
|
||||
stack_.clear();
|
||||
@ -146,7 +147,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
c10::List<at::Tensor> result;
|
||||
result.reserve(InputSize());
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (size_t i = 0; i < InputSize(); ++i) {
|
||||
for (const auto i : c10::irange(InputSize())) {
|
||||
result.emplace_back(Input(i));
|
||||
}
|
||||
return result;
|
||||
@ -156,7 +157,7 @@ class C10OperatorWrapper final : public Operator<Context> {
|
||||
c10::List<at::Tensor> result;
|
||||
result.reserve(OutputSize());
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (size_t i = 0; i < OutputSize(); ++i) {
|
||||
for (const auto i : c10::irange(OutputSize())) {
|
||||
result.emplace_back(OperatorBase::OutputTensorOrUndefined(i));
|
||||
}
|
||||
return result;
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <ATen/core/op_registration/op_registration.h>
|
||||
#include <torch/csrc/jit/frontend/function_schema_parser.h>
|
||||
#include <c10/core/CompileTimeFunctionPointer.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/library.h>
|
||||
#include <vector>
|
||||
|
||||
@ -94,7 +95,7 @@ inline void _call_caffe2_op_from_c10(
|
||||
// We should not unwrap the list if we expect tensor list in the schema.
|
||||
torch::jit::push(*stack, outputs);
|
||||
} else {
|
||||
for (size_t i = 0; i < outputs.size(); ++i) {
|
||||
for (const auto i : c10::irange(outputs.size())) {
|
||||
torch::jit::push(*stack, outputs.extract(i));
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef NOM_CONVERTERS_DOT_H
|
||||
#define NOM_CONVERTERS_DOT_H
|
||||
|
||||
#include "c10/util/irange.h"
|
||||
#include "nomnigraph/Graph/Algorithms.h"
|
||||
#include "nomnigraph/Graph/Graph.h"
|
||||
#include "nomnigraph/Support/Casting.h"
|
||||
@ -42,7 +43,7 @@ class DotGenerator {
|
||||
for (const auto& node : sg.getNodes()) {
|
||||
generateNode(node, sg, output);
|
||||
}
|
||||
for (size_t i = 0; i < subgraphs.size(); ++i) {
|
||||
for (const auto i : c10::irange(subgraphs.size())) {
|
||||
const auto& subgraph = subgraphs[i];
|
||||
output << "subgraph cluster" << i << " {\n";
|
||||
output << "style=dotted;\n";
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H
|
||||
#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H
|
||||
|
||||
#include "c10/util/irange.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "nomnigraph/Graph/Graph.h"
|
||||
|
||||
@ -240,8 +241,7 @@ class MatchGraph : public Graph<MatchPredicate<GraphType>> {
|
||||
// criteria in the given order.
|
||||
|
||||
int currentEdgeIdx = 0;
|
||||
for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria;
|
||||
criteriaIdx++) {
|
||||
for (const auto criteriaIdx : c10::irange(numChildrenCriteria)) {
|
||||
auto childrenCriteriaRef = invertGraphTraversal
|
||||
? criteriaEdges[criteriaIdx]->tail()
|
||||
: criteriaEdges[criteriaIdx]->head();
|
||||
|
@ -9,13 +9,14 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "c10/util/Registry.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
#include "caffe2/utils/filler.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/Registry.h>
|
||||
#include <caffe2/core/common.h>
|
||||
#include <caffe2/core/logging.h>
|
||||
#include <caffe2/core/types.h>
|
||||
#include <caffe2/proto/caffe2_pb.h>
|
||||
#include <caffe2/utils/filler.h>
|
||||
#include <caffe2/utils/proto_utils.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -519,7 +520,7 @@ inline uint64_t nElemFromDim(const TensorShape& X, int dim = 0) {
|
||||
CAFFE_ENFORCE_GE(dim, 0, "Invalid maximum index specified");
|
||||
|
||||
uint64_t nElem = 1;
|
||||
for (int i = dim; i < X.dims_size(); ++i) {
|
||||
for (const auto i : c10::irange(dim, X.dims_size())) {
|
||||
nElem *= X.dims(i);
|
||||
}
|
||||
return nElem;
|
||||
@ -531,7 +532,7 @@ inline uint64_t nElemBetweenDim(const TensorShape& X, int start, int stop) {
|
||||
CAFFE_ENFORCE_LE(stop, X.dims_size(), "Invalid maximum index specified");
|
||||
|
||||
uint64_t nElem = 1;
|
||||
for (int i = start; i < stop; ++i) {
|
||||
for (const auto i : c10::irange(start, stop)) {
|
||||
nElem *= X.dims(i);
|
||||
}
|
||||
return nElem;
|
||||
@ -560,7 +561,7 @@ OpSchema::Cost PointwiseCostInference(
|
||||
const TensorShape X = inputs[0];
|
||||
uint64_t nElemX = nElemFromDim(X);
|
||||
uint64_t nElemRead = 0;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
for (const auto i : c10::irange(inputs.size())) {
|
||||
nElemRead += nElemFromDim(inputs[i]);
|
||||
}
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/typeid.h>
|
||||
|
||||
#include <algorithm>
|
||||
@ -218,7 +219,7 @@ class C10_EXPORT QTensor {
|
||||
*/
|
||||
inline int64_t size_from_dim(int k) const {
|
||||
int64_t r = 1;
|
||||
for (int i = k; i < dims_.size(); ++i) {
|
||||
for (const auto i : c10::irange(k, dims_.size())) {
|
||||
r *= dims_[i];
|
||||
}
|
||||
return r;
|
||||
@ -230,7 +231,7 @@ class C10_EXPORT QTensor {
|
||||
inline int64_t size_to_dim(int k) const {
|
||||
CAFFE_ENFORCE(k < dims_.size());
|
||||
int64_t r = 1;
|
||||
for (int i = 0; i < k; ++i) {
|
||||
for (const auto i : c10::irange(k)) {
|
||||
r *= dims_[i];
|
||||
}
|
||||
return r;
|
||||
|
@ -46,7 +46,7 @@ void QTensorSerializer<Context>::Serialize(
|
||||
blob_proto.set_type(kQTensorBlobQType);
|
||||
QTensorProto& proto = *blob_proto.mutable_qtensor();
|
||||
proto.set_name(name);
|
||||
for (int i = 0; i < qtensor.ndim(); ++i) {
|
||||
for (const auto i : c10::irange(qtensor.ndim())) {
|
||||
proto.add_dims(qtensor.dim32(i));
|
||||
}
|
||||
proto.set_precision(qtensor.precision());
|
||||
|
@ -73,7 +73,7 @@ TORCH_API ExportedStatMap toMap(const ExportedStatList& stats);
|
||||
* int main() {
|
||||
* MyCaffeClass a("first");
|
||||
* MyCaffeClass b("second");
|
||||
* for (int i = 0; i < 10; ++i) {
|
||||
* for (const auto i : c10::irange(10)) {
|
||||
* a.run(10);
|
||||
* b.run(5);
|
||||
* }
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
@ -34,7 +35,7 @@ void assertTensorEquals(
|
||||
float epsilon = 0.1f) {
|
||||
CAFFE_ENFORCE(tensor.IsType<T>());
|
||||
CAFFE_ENFORCE_EQ(tensor.numel(), data.size());
|
||||
for (auto idx = 0; idx < tensor.numel(); ++idx) {
|
||||
for (const auto idx : c10::irange(tensor.numel())) {
|
||||
if (tensor.IsType<float>()) {
|
||||
assertNear(tensor.data<T>()[idx], data[idx], epsilon);
|
||||
} else {
|
||||
@ -88,7 +89,7 @@ void randomFill(
|
||||
std::mt19937 gen(42);
|
||||
std::uniform_real_distribution<RealType> dis(
|
||||
static_cast<RealType>(min), static_cast<RealType>(max));
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
for (const auto i : c10::irange(size)) {
|
||||
data[i] = dis(gen);
|
||||
}
|
||||
}
|
||||
|
@ -120,7 +120,7 @@ inline std::string GetUniqueName() {
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "_cuda_kernel_";
|
||||
for (int i = 0; i < len; ++i) {
|
||||
for (const auto i : c10::irange(len)) {
|
||||
ss << alpha[rand() % (sizeof(alpha) - 1)];
|
||||
}
|
||||
return ss.str();
|
||||
|
@ -32,7 +32,7 @@ template <int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
for (const auto i : c10::irange(vs.size())) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
@ -86,7 +86,7 @@ void MaskMatrix_Inc<float, CPUContext>(
|
||||
int /*N*/,
|
||||
int seq_len,
|
||||
float target) {
|
||||
for (int i = 0; i < seq_len; ++i) {
|
||||
for (const auto i : c10::irange(seq_len)) {
|
||||
// assume that the mask_seq is smaller than size
|
||||
// Although it seems that random access gets bad performance,
|
||||
// we make sure that seq is in order;
|
||||
|
@ -35,7 +35,7 @@ template <int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
for (const auto i : c10::irange(vs.size())) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
@ -71,8 +71,8 @@ void trans_mat<float, CPUContext>(
|
||||
int m,
|
||||
int n,
|
||||
CPUContext* /*context*/) {
|
||||
for (int i = 0; i < m; ++i) {
|
||||
for (int j = 0; j < n; ++j) {
|
||||
for (const auto i : c10::irange(m)) {
|
||||
for (const auto j : c10::irange(n)) {
|
||||
t[j * m + i] = o[i * n + j];
|
||||
}
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ class FunHashOp : public Operator<Context> {
|
||||
|
||||
int64_t n_segments = num_segments_;
|
||||
if (num_segments_ == -1) {
|
||||
for (int64_t i = 0; i < num_nz_ent; ++i) {
|
||||
for (const auto i : c10::irange(num_nz_ent)) {
|
||||
if (seg_data[i] > n_segments) {
|
||||
n_segments = seg_data[i];
|
||||
}
|
||||
@ -86,14 +86,14 @@ class FunHashOp : public Operator<Context> {
|
||||
const auto* val_data = val.template data<T>();
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t output_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T sum = 0;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
uint64_t hash;
|
||||
// The hash function takes as input four integers:
|
||||
// 1. feature index
|
||||
@ -186,14 +186,14 @@ class FunHashGradientOp : public Operator<Context> {
|
||||
|
||||
memset(grad_weight_data, 0, sizeof(T) * num_weight);
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t grad_out_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
uint64_t hash;
|
||||
hash_data[0] = cur_key;
|
||||
hash_data[1] = i;
|
||||
|
@ -66,7 +66,7 @@ class SparseFunHashOp : public Operator<Context> {
|
||||
|
||||
int64_t n_segments = num_segments_;
|
||||
if (num_segments_ == -1) {
|
||||
for (int64_t i = 0; i < num_nz_ent; ++i) {
|
||||
for (const auto i : c10::irange(num_nz_ent)) {
|
||||
if (seg_data[i] > n_segments) {
|
||||
n_segments = seg_data[i];
|
||||
}
|
||||
@ -85,14 +85,14 @@ class SparseFunHashOp : public Operator<Context> {
|
||||
const auto* val_data = val.template data<T>();
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t output_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T sum = 0;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
// The hash function takes as input three integers:
|
||||
// 1. feature index
|
||||
// 2. output index
|
||||
@ -190,14 +190,14 @@ class SparseFunHashGradientOp : public Operator<Context> {
|
||||
const auto* key_data = key.template data<int64_t>();
|
||||
|
||||
int64_t w_ind = 0;
|
||||
for (int64_t j = 0; j < num_nz_ent; ++j) {
|
||||
for (const auto j : c10::irange(num_nz_ent)) {
|
||||
int64_t cur_seg = seg_data[j];
|
||||
int64_t cur_key = key_data[j];
|
||||
T cur_val = val_data[j];
|
||||
int64_t grad_out_stride = cur_seg * num_outputs_;
|
||||
for (int64_t i = 0; i < num_outputs_; ++i) {
|
||||
for (const auto i : c10::irange(num_outputs_)) {
|
||||
T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val;
|
||||
for (int64_t k = 0; k < num_alpha; ++k) {
|
||||
for (const auto k : c10::irange(num_alpha)) {
|
||||
hash_data[0] = cur_key;
|
||||
hash_data[1] = i;
|
||||
hash_data[2] = k;
|
||||
|
@ -111,7 +111,7 @@ class SparseMatrixReshapeOp : public Operator<Context> {
auto* new_col_data = new_col->template mutable_data<int64_t>();
auto* new_row_data = new_row->template mutable_data<int>();

for (int i = 0; i < nnz; ++i) {
for (const auto i : c10::irange(nnz)) {
int64_t offset = old_row_data[i] * old_stride_ + old_col_data[i];
new_row_data[i] = offset / new_stride_;
new_col_data[i] = offset % new_stride_;

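Every hunk in this commit applies the same mechanical rewrite from an index-based for loop to c10::irange. As a minimal standalone sketch of that pattern, assuming only <c10/util/irange.h> from this repository (the function and variable names below are illustrative and do not come from any file in the diff):

#include <c10/util/irange.h>

#include <vector>

// Illustrative only: sums a vector using the loop style this commit adopts.
float sum(const std::vector<float>& data) {
  float total = 0.0f;
  // Before the rewrite this would be: for (size_t i = 0; i < data.size(); ++i)
  // c10::irange(n) yields 0, 1, ..., n - 1, and the index is const.
  for (const auto i : c10::irange(data.size())) {
    total += data[i];
  }
  return total;
}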