Revert D30652629: use irange for loops

Test Plan: revert-hammer

Differential Revision: D30652629 (687c2267d4)

Original commit changeset: 0ae6c4bbbb55

fbshipit-source-id: 5c4f067b584a021c8c9656454d1ee60999600fb3
Committed by: Facebook GitHub Bot
Parent: 1e2b2ee5ff
Commit: 2f099c7555
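For context, the reverted change (D30652629) had rewritten explicit index loops as range-based loops over c10::irange, a small helper in c10/util/irange.h that yields an integer range; this revert restores the original index loops and drops the added includes. A minimal illustrative sketch of the two equivalent forms follows (not code from this commit; fill_shape and its arguments are hypothetical, and compiling it assumes a PyTorch/c10 checkout on the include path):

#include <c10/util/irange.h>  // assumption: header available from a PyTorch/c10 checkout
#include <cstdint>
#include <vector>

// Hypothetical helper used only to illustrate the two loop styles swapped by this revert.
void fill_shape(const int64_t* shapeArr, int64_t rank, std::vector<int64_t>& shapeVec) {
  // Style introduced by D30652629 (removed by this revert):
  for (const auto i : c10::irange(rank)) {
    shapeVec.push_back(shapeArr[i]);
  }
}

void fill_shape_reverted(const int64_t* shapeArr, int64_t rank, std::vector<int64_t>& shapeVec) {
  // Style restored by this revert: explicit index loop.
  for (int64_t i = 0; i < rank; ++i) {
    shapeVec.push_back(shapeArr[i]);
  }
}

Both forms iterate i from 0 to rank - 1; the diff below applies the same mechanical substitution across many files (removed lines marked with "-", restored lines with "+").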
@@ -4,7 +4,6 @@
#include <string>

#include <c10/core/MemoryFormat.h>
-#include <c10/util/irange.h>

#include <fbjni/ByteBuffer.h>
#include <fbjni/fbjni.h>
@@ -98,7 +97,7 @@ static at::Tensor newAtTensor(
std::vector<int64_t> shapeVec{};
shapeVec.reserve(rank);
auto numel = 1;
-for (const auto i : c10::irange(rank)) {
+for (auto i = 0; i < rank; ++i) {
shapeVec.push_back(shapeArr[i]);
numel *= shapeArr[i];
}
@@ -522,7 +521,7 @@ at::IValue JIValue::JIValueToAtIValue(

std::vector<at::IValue> elements;
elements.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (auto i = 0; i < n; ++i) {
auto jivalue_element = jarray->getElement(i);
auto element = JIValue::JIValueToAtIValue(jivalue_element);
elements.push_back(std::move(element));
@@ -536,7 +535,7 @@ at::IValue JIValue::JIValueToAtIValue(
size_t n = jArrayPinned.size();
c10::List<bool> list{};
list.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; ++i) {
list.push_back(jArrayPinned[i]);
}
return at::IValue{std::move(list)};
@@ -548,7 +547,7 @@ at::IValue JIValue::JIValueToAtIValue(
size_t n = jArrayPinned.size();
c10::List<int64_t> list{};
list.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; ++i) {
list.push_back(jArrayPinned[i]);
}
return at::IValue{std::move(list)};
@@ -560,7 +559,7 @@ at::IValue JIValue::JIValueToAtIValue(
size_t n = jArrayPinned.size();
c10::List<double> list{};
list.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; ++i) {
list.push_back(jArrayPinned[i]);
}
return at::IValue{std::move(list)};
@@ -573,7 +572,7 @@ at::IValue JIValue::JIValueToAtIValue(
size_t n = jArray->size();
c10::List<at::Tensor> list{};
list.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; ++i) {
list.push_back(
TensorHybrid::newAtTensorFromJTensor(jArray->getElement(i)));
}
@@ -595,7 +594,7 @@ at::IValue JIValue::JIValueToAtIValue(
c10::impl::GenericList list{c10::unshapedType(first_element.type())};
list.reserve(n);
list.push_back(first_element);
-for (const auto i : c10::irange(1, n)) {
+for (auto i = 1; i < n; ++i) {
auto jivalue_element = jarray->getElement(i);
auto element = JIValue::JIValueToAtIValue(jivalue_element);
list.push_back(element);
@@ -6,7 +6,6 @@
#include <fbjni/ByteBuffer.h>
#include <fbjni/fbjni.h>

-#include <c10/util/irange.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/script.h>
@@ -158,7 +157,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
std::vector<at::IValue> inputs{};
size_t n = jinputs->size();
inputs.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; i++) {
at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
if (at::kVulkan == deviceType_) {
inputs.push_back(
@@ -187,7 +186,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
std::vector<at::IValue> inputs{};
size_t n = jinputs->size();
inputs.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; i++) {
at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
if (at::kVulkan == deviceType_) {
inputs.push_back(
@@ -3,7 +3,6 @@
#include <ATen/BatchedFallback.h>
#include <ATen/native/ResizeCommon.h>
#include <ATen/ATen.h>
-#include <c10/util/irange.h>

namespace at {

@@ -330,7 +329,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) {

VmapDimVector all_dims_physical;
all_dims_physical.reserve(self_physical.tensor().dim());
-for (const auto bdim : c10::irange(self_physical.numBatchDims())) {
+for (int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) {
all_dims_physical.push_back(bdim);
}
all_dims_physical.insert(
@@ -2,7 +2,6 @@

#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
-#include <c10/util/irange.h>
#include <limits>
#include <utility>
#include <cstring>
@@ -131,7 +130,7 @@ inline Tensor sort_strides(Tensor& tensor_) {
IntArrayRef strides = tensor_.strides();
std::vector<int64_t> indices;
indices.reserve(tensor_.ndimension());
-for (const auto i : c10::irange(tensor_.ndimension())) {
+for (int64_t i = 0; i < tensor_.ndimension(); i++) {
indices.push_back(i);
}
std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) {
@@ -197,7 +196,7 @@ inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
if (tensors.size() == 0)
return true;
int64_t all_numel = tensors[0].numel();
-for (const auto i : c10::irange(1, tensors.size())) {
+for (size_t i = 1; i < tensors.size(); i++) {
if (tensors[i].numel() != all_numel)
return false;
}
@@ -11,7 +11,6 @@
#include <c10/util/Exception.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/QEngine.h>
-#include <c10/util/irange.h>

#include <memory>
#include <mutex>
@@ -352,7 +351,7 @@ static inline void manual_seed(uint64_t seed) {
// available. In that case, we must not seed CUDA; it will fail!
const auto num_gpus = detail::getCUDAHooks().getNumGPUs();
if (hasCUDA() && num_gpus > 0) {
-for (const auto i : c10::irange(num_gpus)) {
+for (int i = 0; i < num_gpus; i++) {
auto cuda_gen = globalContext().defaultGenerator(
Device(at::kCUDA, static_cast<c10::DeviceIndex>(i))
);
@@ -197,7 +197,7 @@ std::vector<int64_t> infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef t
// compute output strides which preserves the input tensor's memory layout
std::vector<int64_t> out_strides(ndim);
int64_t curr_stride = 1;
-for (const auto i : c10::irange(ndim)) {
+for (size_t i = 0; i < ndim; ++i) {
int64_t idx = perm[i];
out_strides[idx] = curr_stride;
// Note: for size 0, we simply treated it as 1, it really doesn't matter here
@@ -4,7 +4,6 @@
#include <ATen/Tensor.h>
#include <c10/util/Exception.h>
#include <c10/util/MaybeOwned.h>
-#include <c10/util/irange.h>

#include <functional>
#include <sstream>
@@ -267,7 +266,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
// expands a list of Tensors; ignores undefined (null) tensors
bool first = true;
DimVector sizes;
-for (const auto i : c10::irange(to_expand.size())) {
+for (size_t i = 0; i < to_expand.size(); ++i) {
if (!to_expand[i].defined()) {
continue;
} else if (first) {
@@ -279,7 +278,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
}

std::vector<Tensor> result(to_expand.size());
-for (const auto i : c10::irange(to_expand.size())) {
+for (size_t i = 0; i < to_expand.size(); ++i) {
if (!to_expand[i].defined()) {
continue;
} else if (to_expand[i].sizes().equals(sizes)) {
@@ -300,7 +299,7 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) {
c10::SmallVector<int64_t, 8> reduce_dims;
const at::IntArrayRef sizes = tensor.sizes();
const int64_t leading_dims = sizes.size() - shape.size();
-for (const auto i : c10::irange(leading_dims)) {
+for (int64_t i = 0; i < leading_dims; ++i) {
reduce_dims.push_back(i);
}
for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
@@ -321,7 +320,7 @@ static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
if (ndim > target_dim) {
return false;
}
-for (const auto i : c10::irange(ndim)) {
+for (size_t i = 0; i < ndim; i++) {
int64_t size = shape[ndim - i - 1];
int64_t target = desired[target_dim - i - 1];
if (size != target && size != 1) {
@@ -1,7 +1,6 @@
#include <ATen/MemoryOverlap.h>
#include <ATen/core/TensorBase.h>
#include <c10/core/Layout.h>
-#include <c10/util/irange.h>

namespace at {

@@ -18,7 +17,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) {

auto strides = t->strides();
auto sizes = t->sizes();
-for (const auto i : c10::irange(strides.size())) {
+for (size_t i = 0; i < strides.size(); ++i) {
if (strides[i] == 0 && sizes[i] > 1) {
return MemOverlap::YES;
}
@@ -225,7 +225,7 @@ std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor) {
}
std::vector<Dimname> outnames;
auto tensor_names = tensor.names();
-for (const auto d : c10::irange(tensor.dim())) {
+for (int64_t d = 0; d < tensor.dim(); d++) {
if (tensor.sizes()[d] != 1) {
outnames.push_back(tensor_names[d]);
}
@@ -242,7 +242,7 @@ std::vector<Dimname> compute_diagonal_outnames(
}
std::vector<Dimname> outnames;
auto tensor_names = tensor.names();
-for (const auto d : c10::irange(tensor.dim())) {
+for (int64_t d = 0; d < tensor.dim(); d++) {
if (d == dim1 || d == dim2) {
continue;
}
@@ -6,7 +6,6 @@

#ifndef C10_MOBILE
#include <c10/core/thread_pool.h>
-#include <c10/util/irange.h>
#else
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif // C10_MOBILE
@@ -88,7 +87,7 @@ TaskThreadPoolBase& _get_intraop_pool() {
// `fn` will be called with params: (thread_pool_task_id, task_id).
void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
#ifndef C10_MOBILE
-for (const auto i : c10::irange(1, range)) {
+for (size_t i = 1; i < range; ++i) {
_get_intraop_pool().run([fn, i]() { fn((int)i, i); });
}
// Run the first task on the current thread directly.
@@ -3,7 +3,6 @@
#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/Exception.h>
-#include <c10/util/irange.h>

namespace at {
struct TORCH_API SparseTensorImpl : public TensorImpl {
@@ -110,7 +109,7 @@ public:
bool shrinking_dense_dim = false;
auto sparse_size_original = sizes().slice(0, sparse_dim);
auto sparse_size_new = size.slice(0, sparse_dim);
-for (const auto i : c10::irange(sparse_dim)) {
+for (int64_t i = 0; i < sparse_dim; i++) {
if (sparse_size_new[i] < sparse_size_original[i]) {
shrinking_sparse_dims = true;
break;
@@ -118,7 +117,7 @@ public:
}
auto dense_size_original = sizes().slice(sparse_dim);
auto dense_size_new = size.slice(sparse_dim);
-for (const auto i : c10::irange(dense_dim)) {
+for (int64_t i = 0; i < dense_dim; i++) {
if (dense_size_new[i] < dense_size_original[i]) {
shrinking_dense_dim = true;
break;
@@ -3,7 +3,6 @@
#include <ATen/ATen.h>
#include <ATen/SparseTensorImpl.h>
#include <ATen/Parallel.h>
-#include <c10/util/irange.h>

namespace at { namespace sparse {

@@ -99,7 +98,7 @@ Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) {
at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t h, hp0, hp1;
-for (const auto i : c10::irange(start, end)) {
+for (auto i = start; i < end; i++) {
hp0 = indices[i];
hp1 = (i+1 == nnz) ? dim : indices[i+1];
if (hp0 != hp1) {
@@ -1,7 +1,6 @@
#include <ATen/TensorIndexing.h>

#include <c10/util/Exception.h>
-#include <c10/util/irange.h>

namespace at {
namespace indexing {
@@ -32,7 +31,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)

std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
stream << "(";
-for (const auto i : c10::irange(tensor_indices.size())) {
+for (size_t i = 0; i < tensor_indices.size(); i++) {
stream << tensor_indices[i];
if (i < tensor_indices.size() - 1) stream << ", ";
}
@@ -1,7 +1,6 @@
#pragma once

#include <c10/util/Optional.h>
-#include <c10/util/irange.h>
#include <ATen/core/TensorBody.h>
#include <ATen/ExpandUtils.h>
#include <ATen/Functions.h>
@@ -336,7 +335,7 @@ static inline Tensor scalarToTensor(const Scalar& v, const TensorOptions& option
// strip away unit dimensions from the left of 'src'
static inline IntArrayRef slicePrefix1sSize(const IntArrayRef& sizes) {
size_t first_non1_src = sizes.size();
-for (const auto i : c10::irange(sizes.size())) {
+for (size_t i = 0; i < sizes.size(); ++i) {
if (sizes[i] != 1) {
first_non1_src = i;
break;
@@ -440,7 +439,7 @@ static inline Tensor applySlicing(
"too many indices for tensor of dimension ", (int)self_sizes.size());

Tensor result = self;
-for (const auto i : c10::irange(indices.size())) {
+for (size_t i = 0; i < indices.size(); i++) {
auto& obj = indices[i];
result = handleDimInMultiDimIndexing(
/*prev_dim_result=*/result,
@@ -36,8 +36,8 @@ inline void get_base_ptrs(char** ptrs, ArrayRef<OperandInfo> operands) {
}

inline void get_strides(int64_t* strides, ArrayRef<OperandInfo> operands, int64_t ndim) {
-for (const auto dim : c10::irange(ndim)) {
-for (const auto arg : c10::irange(operands.size())) {
+for (int64_t dim = 0; dim < ndim; ++dim) {
+for (size_t arg = 0; arg < operands.size(); ++arg) {
*strides++ = operands[arg].stride_bytes[dim];
}
}
@@ -214,7 +214,7 @@ void TensorIteratorBase::reorder_dimensions() {
// returns 1 if the dim0 should come after dim1, -1 if dim0 should come
// before dim1, and 0 if the comparison is ambiguous.
auto should_swap = [&](size_t dim0, size_t dim1) {
-for (const auto arg : c10::irange(ntensors())) {
+for (int arg = 0; arg < ntensors(); arg++) {
// ignore undefined or incorrectly sized tensors
if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) {
continue;
@@ -251,7 +251,7 @@ void TensorIteratorBase::reorder_dimensions() {
};

// insertion sort with support for ambiguous comparisons
-for (const auto i : c10::irange(1, ndim())) {
+for (int i = 1; i < ndim(); i++) {
int dim1 = i;
for (int dim0 = i - 1; dim0 >= 0; dim0--) {
int comparison = should_swap(perm_[dim0], perm_[dim1]);
@@ -497,7 +497,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
StrideVector TensorIteratorBase::compatible_stride(int element_size) const {
auto stride = StrideVector();
int64_t next_stride = element_size;
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
stride.push_back(next_stride);
next_stride *= shape_[dim];
}
@@ -510,14 +510,14 @@ DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const {
TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_);
TORCH_INTERNAL_ASSERT(input.size()==perm_.size());
auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to.
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
res[perm_[dim]] = input[dim];
}
return res;
}

void TensorIteratorBase::allocate_or_resize_outputs() {
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++) {
auto& op = operands_[i];
if (!op.tensor_base().defined() || op.will_resize) {
TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -525,7 +525,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
op.stride_bytes = compatible_stride(element_size);
// check if permutation is just an inverted order
bool inverted = true;
-for (const auto i : c10::irange(ndim())) {
+for (int i = 0; i < ndim(); i++) {
if (perm_[i] != ndim() - i - 1) {
inverted = false;
break;
@@ -539,7 +539,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
set_output(i, tensor_shape, {}, original_options(op), names_);
} else {
auto tensor_stride = invert_perm(op.stride_bytes);
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
tensor_stride[dim] /= element_size;
}
set_output(i, tensor_shape, tensor_stride, original_options(op), names_);
@@ -593,7 +593,7 @@ void TensorIteratorBase::coalesce_dimensions() {
if (shape0 == 1 || shape1 == 1) {
return true;
}
-for (const auto i : c10::irange(ntensors())) {
+for (int i = 0; i < ntensors(); i++) {
auto& stride = operands_[i].stride_bytes;
if (shape0 * stride[dim0] != stride[dim1]) {
return false;
@@ -604,14 +604,14 @@ void TensorIteratorBase::coalesce_dimensions() {

// replace each operands stride at dim0 with its stride at dim1
auto replace_stride = [&](int dim0, int dim1) {
-for (const auto i : c10::irange(ntensors())) {
+for (int i = 0; i < ntensors(); i++) {
auto& stride = operands_[i].stride_bytes;
stride[dim0] = stride[dim1];
}
};

int prev_dim = 0;
-for (const auto dim : c10::irange(1, ndim())) {
+for (int dim = 1; dim < ndim(); dim++) {
if (can_coalesce(prev_dim, dim)) {
if (shape_[prev_dim] == 1) {
replace_stride(prev_dim, dim);
@@ -627,7 +627,7 @@ void TensorIteratorBase::coalesce_dimensions() {
}

shape_.resize(prev_dim + 1);
-for (const auto i : c10::irange(ntensors())) {
+for (int i = 0; i < ntensors(); i++) {
operands_[i].stride_bytes.resize(ndim());
}
has_coalesced_dimensions_ = true;
@@ -670,7 +670,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {

auto reorder = [perm](IntArrayRef data) {
auto res = DimVector(data.size(), 0);
-for (const auto i : c10::irange(perm.size())) {
+for (size_t i = 0; i < perm.size(); i++) {
res[i] = data[perm[i]];
}
return res;
@@ -687,7 +687,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {

int64_t TensorIteratorBase::num_output_elements() const {
int64_t elem = 1;
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) {
elem *= shape_[dim];
}
@@ -697,7 +697,7 @@ int64_t TensorIteratorBase::num_output_elements() const {

int TensorIteratorBase::num_reduce_dims() const {
int count = 0;
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
if (operands_[0].stride_bytes[dim] == 0) {
count++;
}
@@ -760,7 +760,7 @@ bool TensorIteratorBase::is_contiguous() const {

bool TensorIteratorBase::is_scalar(int arg) const {
const auto& stride = operands_[arg].stride_bytes;
-for (const auto i : c10::irange(ndim())) {
+for (int i = 0; i < ndim(); i++) {
if (stride[i] != 0 && shape_[i] != 1) {
return false;
}
@@ -815,7 +815,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) {

void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) {
TORCH_INTERNAL_ASSERT(start_dim <= ndim());
-for (const auto i : c10::irange(start_dim, ndim())) {
+for (int i = start_dim; i < ndim(); ++i) {
for (auto& op : operands_) {
op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim];
}
@@ -1063,13 +1063,13 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {

void TensorIteratorBase::mark_outputs() {
// TODO: merge this into populate_operands
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++) {
operands_[i].is_output = true;
const auto& output = tensor(i);
if (!output.defined()) continue;

// check if output is also an input
-for (const auto arg : c10::irange(num_outputs_, ntensors())) {
+for (int arg = num_outputs_; arg < ntensors(); arg++) {
const auto& input = tensor(arg);
if (output.is_same(input)) {
operands_[i].is_read_write = true;
@@ -1086,7 +1086,7 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config)
if (config.static_shape_.has_value()) {
return;
}
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++) {
const auto& output = tensor(i);
if (output.defined() && !output.sizes().equals(shape_)) {
if (config.resize_outputs_ && !operands_[i].is_read_write) {
@@ -1104,11 +1104,11 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config
if (!config.check_mem_overlap_) {
return;
}
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++) {
const auto& output = tensor_base(i);
if (!output.defined()) continue;
assert_no_internal_overlap(output);
-for (const auto j : c10::irange(num_outputs_, ntensors())) {
+for (int j = num_outputs_; j < ntensors(); j++) {
const auto& input = tensor_base(j);
if (!input.is_same(output)) {
assert_no_partial_overlap(output, input);
@@ -1164,7 +1164,7 @@ void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) {
op.stride_bytes.resize(ndim(), 0);
else
op.stride_bytes.resize(ndim());
-for (const auto i : c10::irange(original_shape.size())) {
+for (size_t i = 0; i < original_shape.size(); i++) {
// see NOTE: [Computing output strides]
if (original_shape[i] == 1 && shape_[offset + i] !=1) {
op.stride_bytes[offset + i] = 0;
@@ -1183,7 +1183,7 @@ bool TensorIteratorBase::can_use_32bit_indexing() const {
}
for (auto& op : operands_) {
int64_t max_offset = 1;
-for (const auto dim : c10::irange(ndim())) {
+for (int dim = 0; dim < ndim(); dim++) {
max_offset += (shape_[dim] - 1) * op.stride_bytes[dim];
}
if (max_offset > max_value) {
@@ -1245,7 +1245,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
switch (setup_type) {
case FastSetupType::CONTIGUOUS:
{
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++){
auto& op = operands_[i];
if (!op.tensor_base().defined()) {
TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1256,7 +1256,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
}
case FastSetupType::CHANNELS_LAST:
{
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++){
auto& op = operands_[i];
if (!op.tensor_base().defined()) {
TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1273,7 +1273,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
if (tensor(i_defined).defined()) break;
}
TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs");
-for (const auto i : c10::irange(num_outputs_)) {
+for (int i = 0; i < num_outputs_; i++){
auto& op = operands_[i];
if (!op.tensor_base().defined()) {
TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -4,7 +4,6 @@
#include <c10/util/MaybeOwned.h>
#include <c10/util/SmallVector.h>
#include <c10/util/TypeCast.h>
-#include <c10/util/irange.h>
#include <ATen/core/Dimname.h>
#include <ATen/core/Range.h>
#include <ATen/core/TensorBase.h>
@@ -323,9 +322,9 @@ private:
char** base, const int64_t* strides, int64_t size0, int64_t size1) {
PtrVector data(base, base + ntensor);
const int64_t* outer_strides = &strides[ntensor];
-for (const auto i : c10::irange(size1)) {
+for (int64_t i = 0; i < size1; i++) {
if (i > 0) {
-for (const auto arg : c10::irange(ntensor)) {
+for (int64_t arg = 0; arg < ntensor; arg++) {
data[arg] += outer_strides[arg];
}
}
@@ -398,7 +397,7 @@ public:

bool has_contiguous_first_dim() const {
int num_tensors = ntensors();
-for (const auto i : c10::irange(num_tensors)) {
+for (int i = 0; i < num_tensors; i++) {
if (strides(i)[0] != element_size(i)) {
return false;
}
@@ -1,7 +1,6 @@
#pragma once
#include <ATen/native/TensorIterator.h>
#include <c10/util/SmallBuffer.h>
-#include <c10/util/irange.h>

namespace at {

@@ -25,9 +24,9 @@ inline void get_data_ptrs(
const int64_t ntensors = base.size();
const int64_t ndim = counter.size();
std::copy(base.begin(), base.end(), ptrs);
-for (const auto dim : c10::irange(ndim)) {
+for (int64_t dim = 0; dim < ndim; ++dim) {
int64_t value = counter[dim];
-for (const auto arg : c10::irange(ntensors)) {
+for (int64_t arg = 0; arg < ntensors; ++arg) {
ptrs[arg] += value * strides[dim * ntensors + arg];
}
}
@@ -56,7 +56,7 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
start = maybe_wrap_dim(start, names.size());
end = maybe_wrap_dim(end, names.size());
names_.reserve(end - start);
-for (const auto idx : c10::irange(start, end)) {
+for (int64_t idx = start; idx < end; ++idx) {
names_.emplace_back(names, idx);
}
}
@@ -2,7 +2,6 @@
#include <ATen/Config.h>
#include <ATen/TensorUtils.h>
#include <c10/util/accumulate.h>
-#include <c10/util/irange.h>

#include <ostream>
#include <sstream>
@@ -324,7 +323,7 @@ size_t computeStorageNbytes(
// size of the underlying storage is 1 bigger than the offset
// of the last element according to stride
size_t size = 1;
-for (const auto i : c10::irange(sizes.size())) {
+for(size_t i = 0; i < sizes.size(); i++) {
if(sizes[i] == 0) {
return 0;
}
@@ -83,7 +83,7 @@ VmapDimVector VmapPhysicalView::getPhysicalShape(IntArrayRef logical_shape) cons
static BatchDims computeFrontBatchDimsFromLevels(std::bitset<kVmapNumLevels> levels_bitset) {
BatchDims bdims;
int64_t dim = 0;
-for (const auto level : c10::irange(kVmapNumLevels)) {
+for (int64_t level = 0; level < kVmapNumLevels; level++) {
if (!levels_bitset[level]) {
continue;
}
@@ -208,7 +208,7 @@ MultiBatchVmapTransform::logicalToPhysical(TensorList logical_tensors) {
VmapDimVector batch_sizes(num_batch_dims, 1);
for (const auto& physical_tensor : physical_tensors) {
auto physical_sizes = physical_tensor.sizes();
-for (const auto dim : c10::irange(num_batch_dims)) {
+for (int64_t dim = 0; dim < num_batch_dims; dim++) {
if (physical_sizes[dim] != 1) {
batch_sizes[dim] = physical_sizes[dim];
}
@@ -2,7 +2,6 @@

#include <c10/core/WrapDimMinimal.h>
#include <c10/core/TensorImpl.h>
-#include <c10/util/irange.h>
#include <ATen/core/Tensor.h>

namespace at {
@@ -41,7 +40,7 @@ static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_p
}
int64_t min = -dim_post_expr;
int64_t max = dim_post_expr - 1;
-for (const auto i : c10::irange(ndims)) {
+for (int64_t i = 0; i < ndims; ++i) {
auto &dim = dims[i];
if (dim < min || dim > max) {
TORCH_CHECK_INDEX(false,
@@ -86,7 +85,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) {

// wrap negative dims in a vector
static inline void wrap_all_dims(std::vector<int64_t>& dims_to_wrap, int64_t tensor_total_dims) {
-for (const auto i : c10::irange(dims_to_wrap.size())) {
+for (size_t i = 0; i < dims_to_wrap.size(); i++) {
dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims);
}
}
@@ -1,7 +1,6 @@
#pragma once

#include <c10/core/TensorImpl.h>
-#include <c10/util/irange.h>
#include <ATen/WrapDimUtils.h>
#include <sstream>
#include <bitset>
@@ -16,7 +15,7 @@ constexpr size_t dim_bitset_size = 64;
static inline std::bitset<dim_bitset_size> dim_list_to_bitset(IntArrayRef dims, int64_t ndims) {
TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported");
std::bitset<dim_bitset_size> seen;
-for (const auto i : c10::irange(dims.size())) {
+for (size_t i = 0; i < dims.size(); i++) {
size_t dim = maybe_wrap_dim(dims[i], ndims);
TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims");
seen[dim] = true;
@@ -1,5 +1,4 @@
#include <benchmark/benchmark.h>
-#include <c10/util/irange.h>
#include <torch/csrc/jit/passes/xnnpack_rewrite.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/api/module.h>
@@ -34,7 +33,7 @@ static void stateful_conv1d(benchmark::State& state) {
)");

std::vector<std::vector<torch::jit::IValue>> inputs;
-for (const auto i : c10::irange(10)) {
+for (int i = 0; i < 10; ++i) {
std::vector<torch::jit::IValue> input;
// NOLINTNEXTLINE(modernize-use-emplace)
input.push_back(torch::rand({batch_size, input_channels, width}));
@@ -71,8 +70,8 @@ static void GenerateSizes(benchmark::internal::Benchmark* b) {

for (size_t input_channels = 32; input_channels < 256; input_channels *= 2) {
for (size_t output_channels = 32; output_channels < 256; output_channels *= 2) {
-for (const auto kernel : c10::irange(3, 8)) {
-for (const auto batch_size : c10::irange(1, 5)) {
+for (size_t kernel = 3; kernel < 8; ++kernel) {
+for (size_t batch_size = 1; batch_size < 5; ++batch_size) {
for (size_t width = 32; width < 256; width *= 2) {
b->Args({input_channels, output_channels, kernel, batch_size, width, true});
b->Args({input_channels, output_channels, kernel, batch_size, width, false});
@@ -4,7 +4,6 @@
// device code.

#include <c10/macros/Macros.h>
-#include <c10/util/irange.h>

namespace at { namespace detail {

@@ -1,5 +1,4 @@
#include <ATen/core/Formatting.h>
-#include <c10/util/irange.h>

#include <cmath>
#include <cstdint>
@@ -45,7 +44,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
}
bool intMode = true;
auto self_p = self.data_ptr<double>();
-for (const auto i : c10::irange(size)) {
+for(int64_t i = 0; i < size; i++) {
auto z = self_p[i];
if(std::isfinite(z)) {
if(z != std::ceil(z)) {
@@ -71,7 +70,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
} else {
expMin = fabs(self_p[offset]);
expMax = fabs(self_p[offset]);
-for (const auto i : c10::irange(offset, size)) {
+for(int64_t i = offset; i < size; i++) {
double z = fabs(self_p[i]);
if(std::isfinite(z)) {
if(z < expMin) {
@@ -131,8 +130,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten

static void __printIndent(std::ostream &stream, int64_t indent)
{
-for (const auto i : c10::irange(indent)) {
-(void)i; //Suppress unused variable warning
+for(int64_t i = 0; i < indent; i++) {
stream << " ";
}
}
@@ -170,7 +168,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
printScale(stream,scale);
__printIndent(stream, indent);
}
-for (const auto l : c10::irange(self.size(0))) {
+for(int64_t l = 0; l < self.size(0); l++) {
Tensor row = self.select(0,l);
double *row_ptr = row.data_ptr<double>();
for(int64_t c = firstColumn; c < lastColumn+1; c++) {
@@ -200,7 +198,8 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize)
bool start = true;
bool finished = false;
counter[0] = -1;
-for (const auto i : c10::irange(1, counter.size()))counter[i] = 0;
+for(size_t i = 1; i < counter.size(); i++)
+counter[i] = 0;
while(true) {
for(int64_t i = 0; self.ndimension()-2; i++) {
counter[i] = counter[i] + 1;
@@ -270,7 +269,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
printScale(stream, scale);
}
double* tensor_p = tensor.data_ptr<double>();
-for (const auto i : c10::irange(tensor.size(0))) {
+for (int64_t i = 0; i < tensor.size(0); i++) {
stream << std::setw(sz) << tensor_p[i]/scale << std::endl;
}
}
@@ -285,7 +284,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
__printTensor(stream, tensor, linesize);
}
stream << "[ " << tensor_.toString() << "{" << tensor.size(0);
-for (const auto i : c10::irange(1, tensor.ndimension())) {
+for(int64_t i = 1; i < tensor.ndimension(); i++) {
stream << "," << tensor.size(i);
}
stream << "}";
@@ -155,7 +155,7 @@ private:
data_.seed_ = seed;
data_.seeded_ = true;
data_.state_[0] = seed & 0xffffffff;
-for (const auto j : c10::irange(1, MERSENNE_STATE_N)) {
+for(int j = 1; j < MERSENNE_STATE_N; j++) {
data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
}
data_.left_ = 1;
@@ -3,7 +3,6 @@
#include <c10/macros/Macros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
-#include <c10/util/irange.h>
#include <stdint.h>
#include <cstddef>

@@ -135,7 +134,7 @@ public:
const source_index_t* sizes_,
const source_index_t* strides_)
: data_(data_) {
-for (const auto i : c10::irange(N)) {
+for (int i = 0; i < N; i++) {
this->sizes_[i] = sizes_[i];
this->strides_[i] = strides_[i];
}
@@ -7,7 +7,6 @@
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/ivalue.h>
#include <c10/core/CPUAllocator.h>
-#include <c10/util/irange.h>

template<class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
@@ -88,7 +87,7 @@ inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
template<class T, size_t N>
void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
EXPECT_EQ(expected.size(), actual.size());
-for (const auto i : c10::irange(expected.size())) {
+for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(expected[i], actual[i]);
}
}
@@ -96,7 +95,7 @@ void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
template<class T>
void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
EXPECT_EQ(expected.size(), actual.size());
-for (const auto i : c10::irange(expected.size())) {
+for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(expected[i], actual[i]);
}
}
@@ -104,7 +103,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
template<class T>
void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
EXPECT_EQ(expected.size(), actual.size());
-for (const auto i : c10::irange(expected.size())) {
+for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(expected[i], actual.get(i));
}
}
@@ -112,7 +111,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
template<class T>
void expectListEquals(c10::ArrayRef<T> expected, std::vector<T> actual) {
EXPECT_EQ(expected.size(), actual.size());
-for (const auto i : c10::irange(expected.size())) {
+for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(expected[i], actual[i]);
}
}
@@ -5,7 +5,6 @@
#include <ATen/core/jit_type.h>
#include <c10/util/Bitset.h>
#include <c10/core/DispatchKeySet.h>
-#include <c10/util/irange.h>
#include <ATen/core/Variadic.h>
#include <ATen/core/stack.h>

@@ -172,7 +171,7 @@ private:
"The function schema has ", schema.arguments().size(),
" arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS());
c10::utils::bitset dispatch_arg_indices_reverse;
-for (const auto index : c10::irange(schema.arguments().size())) {
+for (size_t index = 0; index < schema.arguments().size(); ++index) {
if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) ||
schema.arguments()[index].type()->isSubtypeOf(
*ListType::ofTensors()) ||
@@ -5,7 +5,6 @@
#include <ATen/Functions.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/op_registration/op_registration.h>
-#include <c10/util/irange.h>
#include <torch/library.h>

using namespace at;
@@ -52,7 +51,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

// Unwrap all arguments
auto args = torch::jit::pop(*stack, num_arguments);
-for (const auto i : c10::irange(num_arguments)) {
+for (size_t i = 0; i < num_arguments; i++) {
// TODO: Handle tensor list
if (args[i].isTensor()) {
auto* impl = args[i].unsafeToTensorImpl();
@@ -71,7 +70,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

// Rewrap outputs
auto rets = torch::jit::pop(*stack, num_returns);
-for (const auto i : c10::irange(num_returns)) {
+for (size_t i = 0; i < num_returns; i++) {
// TODO: Handle tensor list
if (rets[i].isTensor()) {
torch::jit::push(*stack, at::detail::make_tensor<GenericWrapperTensorImpl>(std::move(rets[i]).toTensor())); // yes move!
@@ -2,7 +2,6 @@

#include <c10/util/StringUtil.h>
#include <c10/util/string_view.h>
-#include <c10/util/irange.h>
#include <ATen/core/jit_type.h>
#include <ATen/core/interned_strings.h>
#include <ATen/core/ivalue.h>
@@ -16,7 +16,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
out << "(";

bool seen_kwarg_only = false;
-for (const auto i : c10::irange(schema.arguments().size())) {
+for(size_t i = 0; i < schema.arguments().size(); ++i) {
if (i > 0) out << ", ";
if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) {
out << "*, ";
@@ -35,7 +35,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

const auto& returns = schema.returns();
out << "(";
-for (const auto i : c10::irange(returns.size())) {
+for(size_t i = 0; i < returns.size(); ++i) {
if (i > 0) {
out << ", ";
}
@@ -53,7 +53,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

inline size_t findFirstOutArg(const std::vector<Argument>& args) {
// find the start of out args in the schema
-for (const auto out_start_idx : c10::irange(args.size())) {
+for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
if (args.at(out_start_idx).is_out()) {
return out_start_idx;
}
@@ -122,7 +122,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
&& arguments().size() >= old.arguments().size())) {
return false;
}
-for (const auto i : c10::irange(returns().size())) {
+for (size_t i = 0; i < returns().size(); ++i) {
// Backwards compatibility requires covariance on argument types
// (i.e. more generic), and contravariance on return types (i.e.
// more specific).
@@ -138,7 +138,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
size_t new_out_start_idx = findFirstOutArg(arguments());

// make sure among the default args, they are backward compatible
-for (const auto i : c10::irange(old_out_start_idx)) {
+for (size_t i = 0; i < old_out_start_idx; i++) {
if (!arguments().at(i).isBackwardCompatibleWith(
old.arguments().at(i), why_not)) {
return false;
@@ -146,7 +146,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}

// // Validate that all new arguments provided has a default value
-for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) {
+for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
if (!arguments().at(i).default_value()) {
if (why_not) {
*why_not
@@ -160,7 +160,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}

// now compare the out args
-for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) {
+for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
if (!arguments()
.at(i - old_out_start_idx + new_out_start_idx)
.isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
@@ -238,7 +238,7 @@ inline void FunctionSchema::checkAndNormalizeInputs(
*this);

size_t consumed_kwargs = 0;
-for (const auto pos : c10::irange(arguments().size())) {
+for (size_t pos = 0; pos < arguments().size(); ++pos) {
const auto& argument = arguments()[pos];
if (pos < inputs.size()) {
checkArg(inputs[pos], argument, pos);
@@ -298,7 +298,7 @@ inline bool isSubtypeOfList(
if (child.size() != parent.size()) {
return false;
}
-for (const auto i : c10::irange(child.size())) {
+for (size_t i = 0; i < child.size(); ++i) {
const Argument& c = child[i];
const Argument& p = parent[i];
if (c.name() != p.name()) {
@@ -22,7 +22,6 @@
#include <c10/util/intrusive_ptr.h>
#include <c10/util/irange.h>
#include <c10/util/hash.h>
-#include <c10/util/irange.h>

namespace torch {
namespace jit {
@@ -1115,7 +1114,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
}
std::ostringstream oss;
oss << devices[0];
-for (const auto idx : c10::irange(1, devices.size())) {
+for (size_t idx = 1; idx < devices.size(); idx++) {
if (idx == devices.size() - 1) {
oss << " and ";
} else {
@@ -1132,7 +1131,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
return c10::kCPU;
}
c10::DeviceType deviceType = devices[0].type();
-for (const auto idx : c10::irange(1, devices.size())) {
+for (size_t idx = 1; idx < devices.size(); idx++) {
TORCH_CHECK_VALUE(
devices[idx].type() == deviceType,
"Expected all devices to be of the same type, but got a mismatch between ",
@@ -1152,7 +1151,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
[](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
// Deduplicate by compacting.
size_t targetIdx = 0;
-for (const auto sourceIdx : c10::irange(devices.size())) {
+for (size_t sourceIdx = 0; sourceIdx < devices.size(); sourceIdx++) {
TORCH_CHECK_VALUE(
devices[sourceIdx].has_index(),
"Expected devices to have indices, got ", devices[sourceIdx]);
@@ -1,5 +1,4 @@
#include <ATen/core/op_registration/infer_schema.h>
-#include <c10/util/irange.h>
#include <sstream>

namespace c10 {
@@ -21,7 +20,7 @@ std::string fastToString(size_t x) {
std::vector<Argument> createArgumentVector(c10::ArrayRef<ArgumentDef> args) {
std::vector<Argument> result;
result.reserve(args.size());
-for (const auto i : c10::irange(args.size())) {
+for (size_t i = 0; i < args.size(); ++i) {
// Arguments are named "_<index>"
result.emplace_back(fastToString(i), (*args[i].getTypeFn)());
}
@@ -50,7 +49,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
" vs " + guts::to_string(rhs.returns().size());
}

-for (const auto i : c10::irange(lhs.arguments().size())) {
+for (size_t i = 0; i < lhs.arguments().size(); ++i) {
const TypePtr& leftType = lhs.arguments()[i].type();
const TypePtr& rightType = rhs.arguments()[i].type();
// Type::operator== is virtual. Comparing pointers first is
@@ -62,7 +61,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
}
}

-for (const auto i : c10::irange(lhs.returns().size())) {
+for (size_t i = 0; i < lhs.returns().size(); ++i) {
const TypePtr& leftType = lhs.returns()[i].type();
const TypePtr& rightType = rhs.returns()[i].type();
// See above about comparing pointers first.
@@ -3,7 +3,6 @@
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>
#include <c10/util/StringUtil.h>
-#include <c10/util/irange.h>
#include <string>

namespace c10 {
@@ -70,7 +69,7 @@ struct QualifiedName {
// Can't be a prefix if it's bigger
return false;
}
-for (const auto i : c10::irange(thisAtoms.size())) {
+for (size_t i = 0; i < thisAtoms.size(); i++) {
if (thisAtoms[i] != otherAtoms[i]) {
return false;
}
@@ -117,7 +116,7 @@ struct QualifiedName {
reserve += e.size() + 1;
}
out.reserve(reserve);
-for (const auto i : c10::irange(v.size())) {
+for (size_t i = 0; i < v.size(); ++i) {
if (i != 0) {
out.push_back(delimiter);
}
@@ -4,7 +4,6 @@

#include <ATen/core/ivalue.h>
#include <c10/util/Deprecated.h>
-#include <c10/util/irange.h>

// TODO move this to c10 namespace

@@ -109,7 +108,7 @@ static inline IValue pop(Stack* stack) {
static inline std::vector<IValue> pop(Stack& stack, size_t n) {
std::vector<IValue> result;
result.reserve(n);
-for (const auto i : c10::irange(n)) {
+for (size_t i = 0; i < n; ++i) {
result.push_back(std::move(peek(stack, i, n)));
}
drop(stack, n);
@@ -4,7 +4,6 @@
// See Note [Do not compile initializers with AVX]

#include <ATen/cpu/vec/vec.h>
-#include <c10/util/irange.h>

namespace at { namespace vec {

@@ -17,7 +16,7 @@ inline scalar_t vec_reduce_all(
using Vec = vec::Vectorized<scalar_t>;
scalar_t acc_arr[Vec::size()];
acc_vec.store(acc_arr);
-for (const auto i : c10::irange(1, size)) {
+for (int64_t i = 1; i < size; i++) {
std::array<scalar_t, Vec::size()> acc_arr_next = {0};
acc_arr_next[0] = acc_arr[i];
Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
@@ -4,7 +4,6 @@
// See Note [Do not compile initializers with AVX]

#include <c10/util/complex.h>
-#include <c10/util/irange.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>

@@ -110,7 +109,7 @@ public:
Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
__at_align__ c10::complex<double> tmp[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -294,7 +293,7 @@ public:
__at_align__ c10::complex<double> y_tmp[size()];
store(x_tmp);
exp.store(y_tmp);
-for (const auto i : c10::irange(size())) {
+for (int i = 0; i < size(); i++) {
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
}
return loadu(x_tmp);
@@ -4,7 +4,6 @@
// See Note [Do not compile initializers with AVX]

#include <c10/util/complex.h>
-#include <c10/util/irange.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
@@ -145,7 +144,7 @@ public:
Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
__at_align__ c10::complex<float> tmp[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -328,7 +327,7 @@ public:
__at_align__ c10::complex<float> y_tmp[size()];
store(x_tmp);
exp.store(y_tmp);
-for (const auto i : c10::irange(size())) {
+for (int i = 0; i < size(); i++) {
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
}
return loadu(x_tmp);
@@ -5,7 +5,6 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
-#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -73,7 +72,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -104,7 +103,7 @@ public:
Vectorized<double> map(double (*const f)(double)) const {
__at_align__ double tmp[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -181,7 +180,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -191,7 +190,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -5,7 +5,6 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
-#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -81,7 +80,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -110,7 +109,7 @@ public:
Vectorized<float> map(float (*const f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -218,7 +217,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -228,7 +227,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -5,7 +5,6 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
-#include <c10/util/irange.h>
// Sleef offers vectorized versions of some transcedentals
// such as sin, cos, tan etc..
// However for now opting for STL, since we are not building
@@ -222,7 +221,7 @@ public:
}
else {
__at_align__ float tmp_values[size()];
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -288,7 +287,7 @@ public:
__at_align__ float tmp[size()];
__at_align__ float res[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int i = 0; i < size(); i++) {
if (_isnan(tmp[i])) {
std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
} else {
@@ -300,7 +299,7 @@ public:
Vectorized<float> map(float (*const f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -337,7 +336,7 @@ public:
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
@@ -372,7 +371,7 @@ public:
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
@@ -382,7 +381,7 @@ public:
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
@@ -398,7 +397,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -408,7 +407,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -430,7 +429,7 @@ public:
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
@@ -495,7 +494,7 @@ public:
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
-for (const auto i : c10::irange(size())) {
+for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::pow(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
@@ -6,7 +6,6 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/macros/Macros.h>
-#include <c10/util/irange.h>
#include <iostream>

namespace at {
@@ -99,7 +98,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
@@ -222,7 +221,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
@@ -436,7 +435,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (const auto i : c10::irange(size())) {
+for (auto i = 0; i < size(); ++i) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
@@ -685,7 +684,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
|
||||
|
@ -6,8 +6,6 @@
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/native/quantized/affine_quantizer_base.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/qint32.h>
|
||||
#include <c10/util/qint8.h>
|
||||
#include <c10/util/quint8.h>
|
||||
@ -741,7 +739,7 @@ struct VectorizedQuantizedConverter {
|
||||
std::array<value_type, size_> vals;
|
||||
|
||||
VectorizedQuantizedConverter(T val) {
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
vals[i] = val.val_;
|
||||
}
|
||||
}
|
||||
@ -759,9 +757,9 @@ struct VectorizedQuantizedConverter {
|
||||
Vectorized<float> zero_point,
|
||||
Vectorized<float> scale_zp_premul) const {
|
||||
float_vec_return_type rv;
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[8];
|
||||
for (const auto j : c10::irange(8)) {
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
tmp_vals[j] = at::native::dequantize_val<T>(
|
||||
scale[j], zero_point[j], T(vals[8 * i + j]));
|
||||
}
|
||||
@ -818,7 +816,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 8> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 8], 8);
|
||||
}
|
||||
|
||||
@ -834,7 +832,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -842,7 +840,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -857,7 +855,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::qint32> zero_point,
|
||||
Vectorized<c10::qint32> q_six) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -866,7 +864,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
|
||||
int_vec_return_type retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval[0].vals[i] = vals[i] - b.vals[i];
|
||||
}
|
||||
return retval;
|
||||
@ -877,7 +875,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
float multiplier,
|
||||
int32_t zero_point) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] =
|
||||
nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
|
||||
zero_point;
|
||||
@ -950,7 +948,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 8> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 8], 8);
|
||||
}
|
||||
|
||||
@ -966,7 +964,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -974,7 +972,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -988,7 +986,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::qint8> zero_point,
|
||||
Vectorized<c10::qint8> q_six) {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -998,8 +996,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
@ -1015,8 +1013,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
@ -1070,7 +1068,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 8> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 8], 8);
|
||||
}
|
||||
|
||||
@ -1086,7 +1084,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -1094,7 +1092,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -1109,7 +1107,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::quint8> zero_point,
|
||||
Vectorized<c10::quint8> q_six) {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -1119,8 +1117,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
@ -1136,8 +1134,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
|
||||
#include <c10/util/complex.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace vec {
|
||||
@ -168,7 +167,7 @@ class Vectorized<ComplexDbl> {
|
||||
Vectorized<ComplexDbl> map(ComplexDbl (*const f)(ComplexDbl)) const {
|
||||
__at_align__ ComplexDbl tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -177,7 +176,7 @@ class Vectorized<ComplexDbl> {
|
||||
Vectorized<ComplexDbl> map(ComplexDbl (*const f)(const ComplexDbl&)) const {
|
||||
__at_align__ ComplexDbl tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -455,7 +454,7 @@ class Vectorized<ComplexDbl> {
|
||||
__at_align__ ComplexDbl y_tmp[size()];
|
||||
store(x_tmp);
|
||||
exp.store(y_tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
|
||||
}
|
||||
return loadu(x_tmp);
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
|
||||
#include <c10/util/complex.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace vec {
|
||||
@ -223,7 +222,7 @@ class Vectorized<ComplexFlt> {
|
||||
Vectorized<ComplexFlt> map(ComplexFlt (*const f)(ComplexFlt)) const {
|
||||
__at_align__ ComplexFlt tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -232,7 +231,7 @@ class Vectorized<ComplexFlt> {
|
||||
Vectorized<ComplexFlt> map(ComplexFlt (*const f)(const ComplexFlt&)) const {
|
||||
__at_align__ ComplexFlt tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -431,7 +430,7 @@ class Vectorized<ComplexFlt> {
|
||||
__at_align__ ComplexFlt y_tmp[size()];
|
||||
store(x_tmp);
|
||||
exp.store(y_tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
|
||||
}
|
||||
return loadu(x_tmp);
|
||||
|
@ -3,8 +3,6 @@
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/quint8.h>
|
||||
#include <array>
|
||||
|
||||
|
@ -4,7 +4,6 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <c10/util/complex.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
@ -150,7 +149,7 @@ public:
|
||||
Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
|
||||
__at_align__ c10::complex<double> tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -358,7 +357,7 @@ public:
|
||||
__at_align__ c10::complex<double> y_tmp[size()];
|
||||
store(x_tmp);
|
||||
exp.store(y_tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
|
||||
}
|
||||
return loadu(x_tmp);
|
||||
|
@ -4,7 +4,6 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <c10/util/complex.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
@ -668,7 +667,7 @@ public:
|
||||
Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
|
||||
__at_align__ c10::complex<float> tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -859,7 +858,7 @@ public:
|
||||
__at_align__ c10::complex<float> y_tmp[size()];
|
||||
store(x_tmp);
|
||||
exp.store(y_tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int i = 0; i < size(); i++) {
|
||||
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
|
||||
}
|
||||
return loadu(x_tmp);
|
||||
|
@ -5,7 +5,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
@ -88,7 +87,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (auto i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0.0;
|
||||
}
|
||||
std::memcpy(
|
||||
@ -121,7 +120,7 @@ public:
|
||||
Vectorized<double> map(double (*const f)(double)) const {
|
||||
__at_align__ double tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -201,7 +200,7 @@ public:
|
||||
__at_align__ double tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -211,7 +210,7 @@ public:
|
||||
__at_align__ double tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
|
@ -5,7 +5,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
@ -105,7 +104,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (auto i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0.0;
|
||||
}
|
||||
std::memcpy(
|
||||
@ -136,7 +135,7 @@ public:
|
||||
Vectorized<float> map(float (*const f)(float)) const {
|
||||
__at_align__ float tmp[size()];
|
||||
store(tmp);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -247,7 +246,7 @@ public:
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
@ -257,7 +256,7 @@ public:
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace vec {
|
||||
@ -101,7 +100,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (auto i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
|
||||
@ -254,7 +253,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (auto i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
|
||||
@ -486,7 +485,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (auto i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
|
||||
@ -762,7 +761,7 @@ public:
|
||||
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
|
||||
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
|
||||
// instructions while a loop would be compiled to one instruction.
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
tmp_values[i] = 0;
|
||||
}
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
|
||||
|
@ -6,8 +6,6 @@
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <ATen/native/quantized/affine_quantizer_base.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/qint32.h>
|
||||
#include <c10/util/qint8.h>
|
||||
#include <c10/util/quint8.h>
|
||||
@ -746,7 +744,7 @@ struct VectorizedQuantizedConverter {
|
||||
std::array<value_type, size_> vals;
|
||||
|
||||
VectorizedQuantizedConverter(T val) {
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
vals[i] = val.val_;
|
||||
}
|
||||
}
|
||||
@ -764,9 +762,9 @@ struct VectorizedQuantizedConverter {
|
||||
Vectorized<float> zero_point,
|
||||
Vectorized<float> scale_zp_premul) const {
|
||||
float_vec_return_type rv;
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
float tmp_vals[16];
|
||||
for (const auto j : c10::irange(16)) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
tmp_vals[j] = at::native::dequantize_val<T>(
|
||||
scale[j], zero_point[j], T(vals[16 * i + j]));
|
||||
}
|
||||
@ -831,7 +829,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 16> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 16], 16);
|
||||
}
|
||||
|
||||
@ -847,7 +845,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -855,7 +853,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -870,7 +868,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::qint32> zero_point,
|
||||
Vectorized<c10::qint32> q_six) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -879,7 +877,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
|
||||
int_vec_return_type retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval[0].vals[i] = vals[i] - b.vals[i];
|
||||
}
|
||||
return retval;
|
||||
@ -890,7 +888,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
|
||||
float multiplier,
|
||||
int32_t zero_point) {
|
||||
Vectorized<c10::qint32> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] =
|
||||
nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
|
||||
zero_point;
|
||||
@ -963,7 +961,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 16> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 16], 16);
|
||||
}
|
||||
|
||||
@ -979,7 +977,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -987,7 +985,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -1001,7 +999,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::qint8> zero_point,
|
||||
Vectorized<c10::qint8> q_six) {
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -1011,8 +1009,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
@ -1028,8 +1026,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::qint8> retval;
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
@ -1083,7 +1081,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
std::array<value_type, size()> qvals;
|
||||
std::array<float, float_num_vecs() * 16> float_vals;
|
||||
|
||||
for (const auto i : c10::irange(float_num_vecs())) {
|
||||
for (int i = 0; i < float_num_vecs(); ++i) {
|
||||
rhs[i].store(&float_vals[i * 16], 16);
|
||||
}
|
||||
|
||||
@ -1099,7 +1097,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -1107,7 +1105,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
|
||||
Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
|
||||
}
|
||||
return retval;
|
||||
@ -1122,7 +1120,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
Vectorized<c10::quint8> zero_point,
|
||||
Vectorized<c10::quint8> q_six) {
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
retval.vals[i] = std::min<value_type>(
|
||||
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
|
||||
}
|
||||
@ -1132,8 +1130,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
|
||||
int_vec_return_type retval;
|
||||
constexpr int elem_per_int_vec = size() / int_num_vecs();
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
retval[i].vals[j] =
|
||||
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
|
||||
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
|
||||
@ -1149,8 +1147,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
|
||||
constexpr auto min_val = std::numeric_limits<value_type>::min();
|
||||
constexpr auto max_val = std::numeric_limits<value_type>::max();
|
||||
Vectorized<c10::quint8> retval;
|
||||
for (const auto i : c10::irange(int_num_vecs())) {
|
||||
for (const auto j : c10::irange(elem_per_int_vec)) {
|
||||
for (size_t i = 0; i < int_num_vecs(); ++i) {
|
||||
for (size_t j = 0; j < elem_per_int_vec; ++j) {
|
||||
int32_t rounded =
|
||||
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
|
||||
zero_point;
|
||||
|
@ -31,7 +31,6 @@
|
||||
#include <ATen/native/cpu/zmath.h>
|
||||
#include <c10/util/TypeCast.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
// These macros helped us unify vec_base.h
|
||||
#ifdef CPU_CAPABILITY_AVX512
|
||||
@ -151,7 +150,7 @@ public:
|
||||
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
int64_t mask = mask_;
|
||||
Vectorized vector;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (mask & 0x01) {
|
||||
vector[i] = b[i];
|
||||
} else {
|
||||
@ -166,7 +165,7 @@ public:
|
||||
Vectorized vector;
|
||||
int_same_size_t<T> buffer[size()];
|
||||
mask.store(buffer);
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (buffer[i] & 0x01)
|
||||
{
|
||||
vector[i] = b[i];
|
||||
@ -179,14 +178,14 @@ public:
|
||||
template<typename step_t> // step sometimes requires a higher precision type (e.g., T=int, step_t=double)
|
||||
static Vectorized<T> arange(T base = static_cast<T>(0), step_t step = static_cast<step_t>(1)) {
|
||||
Vectorized vector;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
vector.values[i] = base + i * step;
|
||||
}
|
||||
return vector;
|
||||
}
|
||||
static Vectorized<T> set(const Vectorized<T>& a, const Vectorized<T>& b, int64_t count = size()) {
|
||||
Vectorized vector;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
if (i < count) {
|
||||
vector[i] = b[i];
|
||||
} else {
|
||||
@ -341,7 +340,7 @@ public:
|
||||
}
|
||||
Vectorized<T> atan2(const Vectorized<T> &exp) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::atan2(values[i], exp[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -381,7 +380,7 @@ public:
|
||||
// U is for SFINAE purposes only. Make sure it is not changed.
|
||||
static_assert(std::is_same<U, T>::value, "U must be T");
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::fmod(values[i], q[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -424,7 +423,7 @@ public:
|
||||
}
|
||||
Vectorized<T> hypot(const Vectorized<T> &b) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::hypot(values[i], b[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -437,14 +436,14 @@ public:
|
||||
}
|
||||
Vectorized<T> igamma(const Vectorized<T> &x) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = calc_igamma(values[i], x[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
Vectorized<T> igammac(const Vectorized<T> &x) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = calc_igammac(values[i], x[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -457,7 +456,7 @@ public:
|
||||
}
|
||||
Vectorized<T> nextafter(const Vectorized<T> &b) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::nextafter(values[i], b[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -495,7 +494,7 @@ public:
|
||||
}
|
||||
Vectorized<T> pow(const Vectorized<T> &exp) const {
|
||||
Vectorized<T> ret;
|
||||
for (const auto i : c10::irange(size())) {
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
ret[i] = std::pow(values[i], exp[i]);
|
||||
}
|
||||
return ret;
|
||||
@ -809,7 +808,7 @@ inline gather(T const* base_addr, const Vectorized<int_same_size_t<T>>& vindex)
|
||||
int_same_size_t<T> index_arr[size];
|
||||
vindex.store(static_cast<void*>(index_arr));
|
||||
T buffer[size];
|
||||
for (const auto i : c10::irange(size)) {
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
|
||||
}
|
||||
return Vectorized<T>::loadu(static_cast<void*>(buffer));
|
||||
@ -827,7 +826,7 @@ inline mask_gather(const Vectorized<T>& src, T const* base_addr,
|
||||
mask.store(static_cast<void*>(mask_arr));
|
||||
vindex.store(static_cast<void*>(index_arr));
|
||||
T buffer[size];
|
||||
for (const auto i : c10::irange(size)) {
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
if (mask_arr[i] & 0x01) { // check highest bit
|
||||
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
|
||||
} else {
|
||||
@ -873,7 +872,7 @@ inline Vectorized<int_same_size_t<T>> convert_to_int_of_same_size(const Vectoriz
|
||||
T src_arr[size];
|
||||
src.store(static_cast<void*>(src_arr));
|
||||
int_same_size_t<T> buffer[size];
|
||||
for (const auto i : c10::irange(size)) {
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
buffer[i] = static_cast<int_same_size_t<T>>(src_arr[i]);
|
||||
}
|
||||
return Vectorized<int_same_size_t<T>>::loadu(static_cast<void*>(buffer));
|
||||
@ -900,7 +899,7 @@ deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
T buffer2[size];
|
||||
a.store(static_cast<void*>(a_arr));
|
||||
b.store(static_cast<void*>(b_arr));
|
||||
for (const auto i : c10::irange(half_size)) {
|
||||
for (int64_t i = 0; i < half_size; i++) {
|
||||
buffer1[i] = a_arr[i * 2];
|
||||
buffer1[half_size + i] = b_arr[i * 2];
|
||||
buffer2[i] = a_arr[i * 2 + 1];
|
||||
@ -932,7 +931,7 @@ interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
|
||||
T buffer2[size];
|
||||
a.store(static_cast<void*>(a_arr));
|
||||
b.store(static_cast<void*>(b_arr));
|
||||
for (const auto i : c10::irange(half_size)) {
|
||||
for (int64_t i = 0; i < half_size; i++) {
|
||||
buffer1[i * 2] = a_arr[i];
|
||||
buffer1[i * 2 + 1] = b_arr[i];
|
||||
buffer2[i * 2] = a_arr[half_size + i];
|
||||
@ -947,8 +946,7 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
|
||||
#ifndef _MSC_VER
|
||||
# pragma unroll
|
||||
#endif
|
||||
for (const auto i : c10::irange(n)) {
|
||||
(void)i; //Suppress unused variable warning
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
*dst = c10::static_cast_with_inter_type<dst_T, src_T>::apply(*src);
|
||||
src++;
|
||||
dst++;
|
||||
|
@ -4,7 +4,6 @@
|
||||
|
||||
#include <ATen/cuda/CUDABlas.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#define CUDABLAS_POSINT_CHECK(FD, X) \
|
||||
TORCH_CHECK( \
|
||||
@ -296,7 +295,7 @@ void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
|
||||
c, CUDA_R_16F, ldc, stridec,
|
||||
num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
for (const auto i : c10::irange(num_batches)) {
|
||||
for (int64_t i = 0; i < num_batches; ++i) {
|
||||
at::cuda::blas::gemm<at::Half>(
|
||||
transa, transb,
|
||||
m, n, k,
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/cudnn/Descriptors.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -48,11 +47,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr
|
||||
#undef STR
|
||||
int size[CUDNN_DIM_MAX];
|
||||
int stride[CUDNN_DIM_MAX];
|
||||
for (const auto i : c10::irange(dim)) {
|
||||
for (size_t i = 0; i < dim; ++i) {
|
||||
size[i] = static_cast<int>(t_sizes[i]);
|
||||
stride[i] = static_cast<int>(t_strides[i]);
|
||||
}
|
||||
for (const auto i : c10::irange(dim, pad)) {
|
||||
for (size_t i = dim; i < pad; ++i) {
|
||||
size[i] = 1;
|
||||
stride[i] = 1;
|
||||
}
|
||||
@ -127,10 +126,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
|
||||
"cuDNN filters (a.k.a. weights) must be contiguous in desired memory_format");
|
||||
|
||||
int size[CUDNN_DIM_MAX];
|
||||
for (const auto i : c10::irange(dim)) {
|
||||
for (int i = 0; i < dim; ++i) {
|
||||
size[i] = (int) t.size(i);
|
||||
}
|
||||
for (const auto i : c10::irange(dim, pad)) {
|
||||
for (int i = dim; i < pad; ++i) {
|
||||
size[i] = (int) 1;
|
||||
}
|
||||
dim = std::max(dim, pad);
|
||||
|
@ -1,6 +1,5 @@
|
||||
#include <ATen/miopen/Descriptors.h>
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
@ -40,11 +39,11 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
|
||||
#undef STR
|
||||
int size[MIOPEN_DIM_MAX];
|
||||
int stride[MIOPEN_DIM_MAX];
|
||||
for (const auto i : c10::irange(dim)) {
|
||||
for (size_t i = 0; i < dim; ++i) {
|
||||
size[i] = static_cast<int>(t_sizes[i]);
|
||||
stride[i] = static_cast<int>(t_strides[i]);
|
||||
}
|
||||
for (const auto i : c10::irange(dim, pad)) {
|
||||
for (size_t i = dim; i < pad; ++i) {
|
||||
size[i] = 1;
|
||||
stride[i] = 1;
|
||||
}
|
||||
@ -104,10 +103,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
|
||||
|
||||
int size[MIOPEN_DIM_MAX];
|
||||
int stride[MIOPEN_DIM_MAX];
|
||||
for (const auto i : c10::irange(dim)) {
|
||||
for (int i = 0; i < dim; ++i) {
|
||||
size[i] = (int) t.size(i);
|
||||
}
|
||||
for (const auto i : c10::irange(dim, pad)) {
|
||||
for (int i = dim; i < pad; ++i) {
|
||||
size[i] = (int) 1;
|
||||
}
|
||||
|
||||
|
@ -500,7 +500,7 @@ inline void _rrelu_with_noise_train(
|
||||
scalar_t* noise_data = noise.data_ptr<scalar_t>();
|
||||
auto gen = at::get_generator_or_default<CPUGeneratorImpl>(generator, detail::getDefaultCPUGenerator());
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
for (const auto i : c10::irange(input.numel())) {
|
||||
for (int64_t i = 0; i < input.numel(); i++) {
|
||||
if (input_data[i] <= 0) {
|
||||
at::uniform_real_distribution<double> uniform(lower, upper);
|
||||
const scalar_t r = (scalar_t)uniform(gen);
|
||||
@ -610,7 +610,7 @@ void inline prelu_cpu_kernel_share_weights(
|
||||
auto weight_val = weight.data_ptr<scalar_t>()[0];
|
||||
|
||||
at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
scalar_t input_data_val = input_data[i];
|
||||
// to allow for compiler optimization, here splitting into two lines:
|
||||
scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val;
|
||||
@ -725,7 +725,7 @@ void inline prelu_cpu_backward_kernel_share_weights(
|
||||
scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0),
|
||||
[&](int64_t start, int64_t end, scalar_t ident) -> scalar_t {
|
||||
scalar_t partial_sum = ident;
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
scalar_t input_data_val = input_data[i];
|
||||
scalar_t grad_out_data_val = grad_out_data[i];
|
||||
// to allow for compiler optimization, here splitting into two lines:
|
||||
@ -839,7 +839,7 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
|
||||
std::vector<int64_t> reduce_dims;
|
||||
reduce_dims.push_back(0);
|
||||
if (dims > 2) {
|
||||
for (const auto i : c10::irange(2, dims))reduce_dims.push_back(i);
|
||||
for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i);
|
||||
}
|
||||
weight_grad = weight_grad_collector.sum(reduce_dims);
|
||||
}
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/native/AdaptivePooling.h>
|
||||
#include <ATen/native/xnnpack/Engine.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
|
||||
namespace at {
|
||||
@ -17,7 +16,7 @@ namespace {
|
||||
{
|
||||
TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2");
|
||||
int64_t ndim = input.ndimension();
|
||||
for (const auto i : c10::irange(1, ndim)) {
|
||||
for (int64_t i = 1; i < ndim; i++) {
|
||||
TORCH_CHECK(input.size(i) > 0,
|
||||
"adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
||||
@ -53,7 +52,7 @@ namespace {
|
||||
const Tensor& input)
|
||||
{
|
||||
int64_t ndim = grad_output.ndimension();
|
||||
for (const auto i : c10::irange(1, ndim)) {
|
||||
for (int64_t i = 1; i < ndim; i++) {
|
||||
TORCH_CHECK(grad_output.size(i) > 0,
|
||||
"adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
||||
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being "
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -34,19 +33,19 @@ static void adaptive_avg_pool3d_out_frame(
|
||||
int64_t istrideH,
|
||||
int64_t istrideW) {
|
||||
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
|
||||
for (const auto d : c10::irange(start, end)) {
|
||||
for (int64_t d = start; d < end; d++) {
|
||||
/* loop over output */
|
||||
for (const auto ot : c10::irange(osizeT)) {
|
||||
for (int64_t ot = 0; ot < osizeT; ot++) {
|
||||
int istartT = start_index(ot, osizeT, isizeT);
|
||||
int iendT = end_index(ot, osizeT, isizeT);
|
||||
int kT = iendT - istartT;
|
||||
|
||||
for (const auto oh : c10::irange(osizeH)) {
|
||||
for (int64_t oh = 0; oh < osizeH; oh++) {
|
||||
int istartH = start_index(oh, osizeH, isizeH);
|
||||
int iendH = end_index(oh, osizeH, isizeH);
|
||||
int kH = iendH - istartH;
|
||||
|
||||
for (const auto ow : c10::irange(osizeW)) {
|
||||
for (int64_t ow = 0; ow < osizeW; ow++) {
|
||||
int istartW = start_index(ow, osizeW, isizeW);
|
||||
int iendW = end_index(ow, osizeW, isizeW);
|
||||
int kW = iendW - istartW;
|
||||
@ -59,9 +58,9 @@ static void adaptive_avg_pool3d_out_frame(
|
||||
|
||||
/* compute local average: */
|
||||
scalar_t sum = 0;
|
||||
for (const auto it : c10::irange(kT)) {
|
||||
for (const auto ih : c10::irange(kH)) {
|
||||
for (const auto iw : c10::irange(kW)) {
|
||||
for (int it = 0; it < kT; it++) {
|
||||
for (int ih = 0; ih < kH; ih++) {
|
||||
for (int iw = 0; iw < kW; iw++) {
|
||||
scalar_t val =
|
||||
*(ip + it * istrideT + ih * istrideH + iw * istrideW);
|
||||
sum += val;
|
||||
@ -84,7 +83,7 @@ void adaptive_avg_pool3d_out_cpu_template(
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3");
|
||||
|
||||
for (const auto i : c10::irange(1, input.ndimension())) {
|
||||
for (int64_t i = 1; i < input.ndimension(); i++) {
|
||||
TORCH_CHECK(
|
||||
input.size(i) > 0,
|
||||
"adaptive_avg_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||
@ -149,7 +148,7 @@ void adaptive_avg_pool3d_out_cpu_template(
|
||||
auto input_data = input.data_ptr<scalar_t>();
|
||||
auto output_data = output.data_ptr<scalar_t>();
|
||||
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (int64_t b = start; b < end; ++b) {
|
||||
adaptive_avg_pool3d_out_frame<scalar_t>(
|
||||
input_data + b * input.stride(0),
|
||||
output_data + b * sizeD * osizeT * osizeH * osizeW,
|
||||
@ -182,22 +181,22 @@ static void adaptive_avg_pool3d_backward_out_frame(
|
||||
int64_t osizeH,
|
||||
int64_t osizeW) {
|
||||
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
|
||||
for (const auto d : c10::irange(start, end)) {
|
||||
for (int64_t d = start; d < end; d++) {
|
||||
scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH;
|
||||
scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH;
|
||||
|
||||
/* calculate average */
|
||||
for (const auto ot : c10::irange(osizeT)) {
|
||||
for (int64_t ot = 0; ot < osizeT; ot++) {
|
||||
int istartT = start_index(ot, osizeT, isizeT);
|
||||
int iendT = end_index(ot, osizeT, isizeT);
|
||||
int kT = iendT - istartT;
|
||||
|
||||
for (const auto oh : c10::irange(osizeH)) {
|
||||
for (int64_t oh = 0; oh < osizeH; oh++) {
|
||||
int istartH = start_index(oh, osizeH, isizeH);
|
||||
int iendH = end_index(oh, osizeH, isizeH);
|
||||
int kH = iendH - istartH;
|
||||
|
||||
for (const auto ow : c10::irange(osizeW)) {
|
||||
for (int64_t ow = 0; ow < osizeW; ow++) {
|
||||
int istartW = start_index(ow, osizeW, isizeW);
|
||||
int iendW = end_index(ow, osizeW, isizeW);
|
||||
int kW = iendW - istartW;
|
||||
@ -206,9 +205,9 @@ static void adaptive_avg_pool3d_backward_out_frame(
|
||||
gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT /
|
||||
kH / kW;
|
||||
|
||||
for (const auto it : c10::irange(istartT, iendT)) {
|
||||
for (const auto ih : c10::irange(istartH, iendH)) {
|
||||
for (const auto iw : c10::irange(istartW, iendW)) {
|
||||
for (int it = istartT; it < iendT; it++) {
|
||||
for (int ih = istartH; ih < iendH; ih++) {
|
||||
for (int iw = istartW; iw < iendW; iw++) {
|
||||
/* update gradient */
|
||||
gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] +=
|
||||
grad_delta;
|
||||
@ -266,7 +265,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template(
|
||||
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
|
||||
scalar_t* gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
||||
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
adaptive_avg_pool3d_backward_out_frame<scalar_t>(
|
||||
gradInput_data + b * sizeD * isizeT * isizeH * isizeW,
|
||||
gradOutput_data + b * sizeD * osizeT * osizeH * osizeW,
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/native/AdaptivePooling.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
|
||||
namespace at {
|
||||
@ -11,7 +10,7 @@ TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_si
|
||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||
"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ",
|
||||
input.sizes());
|
||||
for (const auto i : c10::irange(1, ndim)) {
|
||||
for (int64_t i = 1; i < ndim; i++) {
|
||||
TORCH_CHECK(input.size(i) > 0,
|
||||
"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||
"but input has sizes ", input.sizes(), " with dimension ", i,
|
||||
@ -52,7 +51,7 @@ TORCH_META_FUNC(adaptive_max_pool2d_backward)
|
||||
int64_t ndim = grad_output.ndimension();
|
||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||
"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes());
|
||||
for (const auto i : c10::irange(1, ndim)) {
|
||||
for (int64_t i = 1; i < ndim; i++) {
|
||||
TORCH_CHECK(grad_output.size(i) > 0,
|
||||
"adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
||||
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i,
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
@ -12,7 +11,7 @@ TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_si
|
||||
TORCH_CHECK(
|
||||
ndim == 4 || ndim == 5,
|
||||
"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: ", input.sizes());
|
||||
for (const auto i : c10::irange(1, ndim)) {
|
||||
for (int64_t i = 1; i < ndim; i++) {
|
||||
TORCH_CHECK(
|
||||
input.size(i) > 0,
|
||||
"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||
@ -97,7 +96,8 @@ static void adaptive_max_pool3d_single_out_frame(
|
||||
int64_t istrideW)
|
||||
{
|
||||
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto d : c10::irange(start, end)) {
|
||||
for (auto d = start; d < end; d++)
|
||||
{
|
||||
/* loop over output */
|
||||
int64_t ot, oh, ow;
|
||||
for(ot = 0; ot < osizeT; ot++)
|
||||
@ -176,7 +176,8 @@ static void adaptive_max_pool3d_out_frame(
|
||||
int64_t istrideW)
|
||||
{
|
||||
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (auto b = start; b < end; b++)
|
||||
{
|
||||
adaptive_max_pool3d_single_out_frame<scalar_t>(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW,
|
||||
indices_data+b*sizeD*osizeT*osizeH*osizeW,
|
||||
sizeD,
|
||||
@ -202,7 +203,8 @@ static void adaptive_max_pool3d_backward_single_out_frame(
|
||||
int64_t osizeW)
|
||||
{
|
||||
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto d : c10::irange(start, end)) {
|
||||
for (auto d = start; d < end; d++)
|
||||
{
|
||||
scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW;
|
||||
scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW;
|
||||
int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW;
|
||||
@ -242,7 +244,8 @@ static void adaptive_max_pool3d_backward_out_frame(
|
||||
int64_t osizeW)
|
||||
{
|
||||
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (auto b = start; b < end; b++)
|
||||
{
|
||||
adaptive_max_pool3d_backward_single_out_frame<scalar_t>(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW,
|
||||
indices_data+b*sizeD*osizeT*osizeH*osizeW,
|
||||
sizeD,
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/native/Pool.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
@ -170,7 +169,8 @@ static void avg_pool3d_out_frame(
|
||||
c10::optional<int64_t> divisor_override)
|
||||
{
|
||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto k : c10::irange(start, end)) {
|
||||
for (auto k = start; k < end; k++)
|
||||
{
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
int64_t i, j, ti;
|
||||
|
||||
@ -315,7 +315,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) (
|
||||
scalar_t *output_data = output.data_ptr<scalar_t>();
|
||||
|
||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto p : c10::irange(start, end)) {
|
||||
for (auto p = start; p < end; p++) {
|
||||
avg_pool3d_out_frame(
|
||||
input_data + p * istride, output_data + p * ostride, nslices,
|
||||
itime, iwidth, iheight,
|
||||
@ -358,7 +358,8 @@ static void avg_pool3d_backward_out_frame(
|
||||
c10::optional<int64_t> divisor_override)
|
||||
{
|
||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto k : c10::irange(start, end)) {
|
||||
for (auto k = start; k < end; k++)
|
||||
{
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
int64_t i, j, ti;
|
||||
|
||||
@ -499,7 +500,8 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) (
|
||||
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
||||
|
||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto p : c10::irange(start, end)) {
|
||||
for (auto p = start; p < end; p++)
|
||||
{
|
||||
avg_pool3d_backward_out_frame(
|
||||
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
|
||||
itime, iwidth, iheight,
|
||||
|
@ -63,7 +63,7 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
||||
std::function<void(int64_t, int64_t)> loop = [](int64_t, int64_t){};
|
||||
if (upper) {
|
||||
loop = [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
for (int64_t j = i + 1; j < n; j++) {
|
||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||
}
|
||||
@ -71,8 +71,8 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
||||
};
|
||||
} else {
|
||||
loop = [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (const auto j : c10::irange(i)) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
for (int64_t j = 0; j < i; j++) {
|
||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||
}
|
||||
}
|
||||
@ -106,7 +106,7 @@ void apply_cholesky_inverse(Tensor& input, Tensor& infos, bool upper) {
|
||||
auto n = input.size(-2);
|
||||
auto lda = std::max<int64_t>(1, n);
|
||||
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
|
||||
int* info_working_ptr = &infos_data[i];
|
||||
lapackCholeskyInverse<scalar_t>(uplo, n, input_working_ptr, lda, info_working_ptr);
|
||||
@ -501,7 +501,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) {
|
||||
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
|
||||
Tensor work = at::empty({lwork}, self.options());
|
||||
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
|
||||
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
|
||||
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <algorithm>
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Config.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#if AT_BUILD_WITH_BLAS()
|
||||
extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy);
|
||||
@ -152,7 +151,7 @@ inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx)
|
||||
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
|
||||
return;
|
||||
}
|
||||
for (const auto i : c10::irange(n)) {
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
if (a == scalar_t(0)) {
|
||||
x[i * incx] = 0;
|
||||
} else {
|
||||
@ -177,10 +176,11 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
||||
}
|
||||
|
||||
if ((trans == 'T') || (trans == 't')) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
for (int64_t i = 0; i < n; i++)
|
||||
{
|
||||
scalar_t sum = 0;
|
||||
scalar_t *row_ = a + lda * i;
|
||||
for (const auto j : c10::irange(m)) {
|
||||
for (int64_t j = 0; j < m; j++) {
|
||||
sum += x[j * incx] * row_[j];
|
||||
}
|
||||
if (beta == scalar_t(0)) {
|
||||
@ -192,10 +192,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
||||
} else {
|
||||
if (beta != scalar_t(1) && beta != scalar_t(0)) scal<scalar_t>(m, beta, y, incy);
|
||||
|
||||
for (const auto j : c10::irange(n)) {
|
||||
for (int64_t j = 0; j < n; j++) {
|
||||
scalar_t *column_ = a + lda * j;
|
||||
scalar_t z = alpha * x[j * incx];
|
||||
for (const auto i : c10::irange(m)) {
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
//output values are ignored if beta is 0, and set to 0, nans and infs are not propagated
|
||||
if (j==0 && beta==scalar_t(0)) {
|
||||
y[i * incy] = scalar_t(0);
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/native/BucketizationUtils.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
/* Implement a TF like searchsorted and a bucketize function running on cpu
|
||||
*
|
||||
@ -59,7 +58,7 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
|
||||
|
||||
bool is_1d_boundaries = boundaries.dim() == 1;
|
||||
at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
// If boundaries tensor is 1d, we always search the entire boundary tensor
|
||||
int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd;
|
||||
const input_t *data_bd_start = &data_bd[start_bd];
|
||||
|
@ -5,7 +5,6 @@
|
||||
|
||||
#include <ATen/native/im2col.h>
|
||||
#include <ATen/native/im2col_shape_check.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
// Note [im2col/col2im output padding]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -151,7 +150,7 @@ static void col2im_out_cpu_template(
|
||||
stride_width +
|
||||
1;
|
||||
|
||||
for (const auto elt : c10::irange(batch_size)) {
|
||||
for (int64_t elt = 0; elt < batch_size; elt++) {
|
||||
input_n = input.select(0, elt);
|
||||
output_n = output.select(0, elt);
|
||||
|
||||
|
@ -24,7 +24,7 @@ inline Tensor view_tensor(
|
||||
|
||||
inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
|
||||
DimVector res(oldstride.size() + 1);
|
||||
for (const auto i : c10::irange(oldstride.size())) {
|
||||
for(size_t i = 0; i < oldstride.size(); i++) {
|
||||
res[i] = oldstride[i] * 2;
|
||||
}
|
||||
res.back() = 1;
|
||||
|
@ -47,7 +47,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
|
||||
new_shape.emplace_back(input_sizes[i]);
|
||||
}
|
||||
|
||||
for (const auto i : c10::irange((size_t)l_pad)) {
|
||||
for (size_t i = 0; i < (size_t)l_pad; i++) {
|
||||
auto pad_idx = pad.size() - ((i + 1) * 2);
|
||||
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
|
||||
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
|
||||
|
@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
#include <ATen/detail/CUDAHooksInterface.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -36,7 +35,7 @@ static inline std::vector<int64_t> conv_output_size(
|
||||
std::vector<int64_t> output_size(dim);
|
||||
output_size[0] = input_size[input_batch_size_dim];
|
||||
output_size[1] = weight_size[weight_output_channels_dim];
|
||||
for (const auto d : c10::irange(2, dim)) {
|
||||
for (size_t d = 2; d < dim; ++d) {
|
||||
auto dilation_ = has_dilation ? dilation[d - 2] : 1;
|
||||
auto kernel = dilation_ * (weight_size[d] - 1) + 1;
|
||||
output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
|
||||
@ -54,7 +53,7 @@ static inline std::vector<int64_t> conv_input_size(
|
||||
std::vector<int64_t> input_size(dim);
|
||||
input_size[0] = output_size[output_batch_size_dim];
|
||||
input_size[1] = weight_size[weight_input_channels_dim] * groups;
|
||||
for (const auto d : c10::irange(2, dim)) {
|
||||
for (size_t d = 2; d < dim; ++d) {
|
||||
int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
|
||||
input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
|
||||
kernel + output_padding[d - 2];
|
||||
@ -70,7 +69,7 @@ static inline std::vector<int64_t> conv_weight_size(
|
||||
std::vector<int64_t> weight_size(dim);
|
||||
weight_size[0] = output_size[1];
|
||||
weight_size[1] = input_size[1] / groups;
|
||||
for (const auto d : c10::irange(2, dim)) {
|
||||
for (size_t d = 2; d < dim; ++d) {
|
||||
int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
|
||||
+ 2 * padding[d - 2] - output_padding[d - 2];
|
||||
weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
|
||||
|
@ -975,7 +975,7 @@ at::Tensor _convolution(
|
||||
} else {
|
||||
std::vector<Tensor> outputs(params.groups);
|
||||
input = input.contiguous();
|
||||
for (const auto g : c10::irange(params.groups)) {
|
||||
for (int g = 0; g < params.groups; ++g) {
|
||||
auto input_g = subtensor(input, 1, params.groups, g);
|
||||
auto weight_g = subtensor(weight, 0, params.groups, g);
|
||||
auto bias_g = subtensor(bias, 0, params.groups, g);
|
||||
@ -1212,7 +1212,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
||||
}
|
||||
} else {
|
||||
std::vector<Tensor> gWt_list(groups);
|
||||
for (const auto g : c10::irange(groups)) {
|
||||
for (int g = 0; g < groups; ++g) {
|
||||
auto ggIt_g = subvariable(ggIt, 0, groups, g);
|
||||
auto gOt_g = subvariable(gOt, 0, groups, g);
|
||||
if (gOt_g.is_cuda()) {
|
||||
@ -1239,7 +1239,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
||||
// the ConvForward kernels don't support asymmetric padding.
|
||||
auto gW_size = gW.sizes();
|
||||
auto w_size = weight.sizes();
|
||||
for (const auto i : c10::irange(2, gW_size.size())) {
|
||||
for (size_t i = 2; i < gW_size.size(); ++i) {
|
||||
if (gW_size[i] > w_size[i]) {
|
||||
gW = gW.narrow(i, 0, w_size[i]);
|
||||
gW_size = gW.sizes();
|
||||
@ -1268,7 +1268,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
||||
// rather than narrowing the computed gI
|
||||
auto gI_size = gI.sizes();
|
||||
auto i_size = input.sizes();
|
||||
for (const auto i : c10::irange(2, gI_size.size())) {
|
||||
for (size_t i = 2; i < gI_size.size(); ++i) {
|
||||
if (gI_size[i] > i_size[i]) {
|
||||
gI = gI.narrow(i, 0, i_size[i]);
|
||||
gI_size = gI.sizes();
|
||||
@ -1289,7 +1289,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
||||
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
|
||||
}
|
||||
} else {
|
||||
for (const auto i : c10::irange(kernel_size.size())) {
|
||||
for(size_t i = 0; i < kernel_size.size(); ++i) {
|
||||
// Check if whole input has been used or not
|
||||
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
|
||||
- 2 * gi_conv_params.padding[i]
|
||||
|
@ -7,7 +7,6 @@
|
||||
#include <ATen/div_rtn.h>
|
||||
#include <ATen/native/CPUBlas.h>
|
||||
#include <ATen/native/Unfold2d.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -300,7 +299,7 @@ void slow_conv2d_backward_out_cpu_template(
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
auto fgrad_input = std::make_unique<scalar_t[]>(
|
||||
c10::multiply_integers(finput.sizes().slice(1)));
|
||||
for (const auto t : c10::irange(start, end)) {
|
||||
for (int64_t t = start; t < end; t++) {
|
||||
auto grad_input_t = grad_input_a[t];
|
||||
auto grad_output_t = grad_output_a[t];
|
||||
slow_conv2d_backward_update_grad_input_frame(
|
||||
@ -479,7 +478,7 @@ std::tuple<Tensor&, Tensor&> slow_conv2d_forward_out_cpu(
|
||||
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
|
||||
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto t : c10::irange(start, end)) {
|
||||
for (int64_t t = start; t < end; t++) {
|
||||
auto input_t = input_a[t];
|
||||
auto output_t = output_a[t];
|
||||
auto finput_t = finput_a[t];
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <ATen/div_rtn.h>
|
||||
#include <ATen/native/CPUBlas.h>
|
||||
#include <ATen/native/Unfold3d.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
constexpr int64_t CONV3D_GRAIN_SALT = 20;
|
||||
|
||||
@ -359,7 +358,7 @@ void slow_conv3d_backward_out_cpu_template(
|
||||
auto fgrad_input_a = fgrad_input.accessor<scalar_t, 3>();
|
||||
auto weight_2d_a = weight2d.accessor<scalar_t, 2>();
|
||||
|
||||
for (const auto t : c10::irange(start, end)) {
|
||||
for (int64_t t = start; t < end; t++) {
|
||||
auto grad_input_t = grad_input_a[t];
|
||||
auto grad_output_t = grad_output_a[t];
|
||||
auto fgrad_input_t = fgrad_input_a[t];
|
||||
@ -463,7 +462,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template(
|
||||
auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
|
||||
auto grad_output_a = grad_output_contiguous.accessor<scalar_t, 5>();
|
||||
auto finput_a = finput.accessor<scalar_t, 3>();
|
||||
for (const auto t : c10::irange(batch_size)) {
|
||||
for (int64_t t = 0; t < batch_size; t++) {
|
||||
auto grad_output_t = grad_output_a[t];
|
||||
auto finput_t = finput_a[t];
|
||||
slow_conv3d_backward_weight_frame(
|
||||
@ -565,7 +564,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv3d_forward_out_cpu(const Tensor&
|
||||
|
||||
at::parallel_for(
|
||||
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
|
||||
for (const auto t : c10::irange(start, end)) {
|
||||
for (int64_t t = start; t < end; t++) {
|
||||
auto input_t = input_a[t];
|
||||
auto output_t = output_a[t];
|
||||
auto finput_t = finput_a[t];
|
||||
|
@ -1,6 +1,5 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <tuple>
|
||||
|
||||
namespace at {
|
||||
@ -40,7 +39,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
||||
weight_size[2],
|
||||
}, self.options());
|
||||
output.copy_(bias.expand(output.sizes()));
|
||||
for (const auto k : c10::irange(kw)) {
|
||||
for (int k = 0; k < kw; k++) {
|
||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
@ -66,16 +65,16 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
||||
int nc = std::min(NC - C, BLOCK_SZ);
|
||||
|
||||
// 1. copy columns from src to buf
|
||||
for (const auto c : c10::irange(nc)) {
|
||||
for (int c = 0; c < nc; c++) {
|
||||
memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t));
|
||||
}
|
||||
|
||||
// 2. transpose buf in place
|
||||
int rc_max = std::max(nr, nc);
|
||||
int rc_min = std::min(nr, nc);
|
||||
for (const auto r : c10::irange(rc_max)) {
|
||||
for (int r = 0; r < rc_max; r++) {
|
||||
int end = std::min(r, rc_min);
|
||||
for (const auto c : c10::irange(end)) {
|
||||
for (int c = 0; c < end; c++) {
|
||||
scalar_t tmp = bp[r + BLOCK_SZ * c];
|
||||
bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
|
||||
bp[r * BLOCK_SZ + c] = tmp;
|
||||
@ -83,7 +82,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
||||
}
|
||||
|
||||
// 3. copy rows from buf to dst
|
||||
for (const auto r : c10::irange(nr)) {
|
||||
for (int r = 0; r < nr; r++) {
|
||||
memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t));
|
||||
}
|
||||
}
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <ATen/NativeFunctions.h>
|
||||
|
||||
#include <ATen/native/Cross.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -31,7 +30,7 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const c10::option
|
||||
|
||||
int64_t dim = -1;
|
||||
if(!dimension.has_value()) {
|
||||
for (const auto i : c10::irange(input.dim())) {
|
||||
for(int64_t i = 0; i < input.dim(); i++) {
|
||||
if(input.size(i) == 3) {
|
||||
dim = i;
|
||||
break;
|
||||
|
@ -5,7 +5,6 @@
|
||||
|
||||
#include <ATen/div_rtn.h>
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||
TORCH_CHECK( \
|
||||
@ -44,7 +43,7 @@ std::vector<int64_t> get_output_size(
|
||||
IntArrayRef pad_size,
|
||||
IntArrayRef dilation_size) {
|
||||
std::vector<int64_t> sizes;
|
||||
for (const auto index : c10::irange(dim)) {
|
||||
for (int index = 0; index < dim; index++) {
|
||||
sizes.push_back(
|
||||
div_rtn<int64_t>(
|
||||
input.size(index + input.dim() - dim) + 2 * pad_size[index] -
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/native/Pool.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
@ -38,7 +37,8 @@ static void max_pool3d_with_indices_single_out_frame(
|
||||
int dilationH)
|
||||
{
|
||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto k : c10::irange(start, end)) {
|
||||
for (auto k = start; k < end; k++)
|
||||
{
|
||||
/* loop over output */
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
int64_t i, j, ti;
|
||||
@ -120,7 +120,8 @@ static void max_pool3d_with_indices_out_frame(
|
||||
int dilationT, int dilationW, int dilationH)
|
||||
{
|
||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto p : c10::irange(start, end)) {
|
||||
for (auto p = start; p < end; p++)
|
||||
{
|
||||
max_pool3d_with_indices_single_out_frame(
|
||||
input_data + p * istride,
|
||||
output_data + p * ostride,
|
||||
@ -284,7 +285,8 @@ static void max_pool3d_with_indices_backward_single_out_frame(
|
||||
int dilationH)
|
||||
{
|
||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto k : c10::irange(start, end)) {
|
||||
for (auto k = start; k < end; k++)
|
||||
{
|
||||
scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
|
||||
scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
|
||||
int64_t *indz_p_k = indz_p + k * otime * owidth * oheight;
|
||||
@ -328,7 +330,8 @@ static void max_pool3d_with_indices_backward_out_frame(
|
||||
int dilationT, int dilationW, int dilationH)
|
||||
{
|
||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto p : c10::irange(start, end)) {
|
||||
for (auto p = start; p < end; p++)
|
||||
{
|
||||
max_pool3d_with_indices_backward_single_out_frame<scalar_t>(
|
||||
gradInput_data + p * istride,
|
||||
gradOutput_data + p * ostride,
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -17,10 +16,8 @@ Tensor make_feature_noise(const Tensor& input) {
|
||||
sizes.reserve(input.dim());
|
||||
sizes.push_back(input_sizes[0]);
|
||||
sizes.push_back(input_sizes[1]);
|
||||
for (const auto i : c10::irange(2, input.dim())) {
|
||||
(void)i; //Suppress unused variable warning
|
||||
for (int64_t i = 2; i < input.dim(); ++i)
|
||||
sizes.push_back(1);
|
||||
}
|
||||
return at::empty(sizes, input.options());
|
||||
}
|
||||
|
||||
|
@ -123,7 +123,7 @@ Tensor embedding_dense_backward_cpu(
|
||||
|
||||
auto parallel_section = [&](index_t start, index_t end) {
|
||||
TensorIterator iter(add_iter);
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
for (int64_t i = 0; i < numel; i++) {
|
||||
if (indices_data[i] != padding_idx) {
|
||||
index_t k = indices_data[i];
|
||||
if (k >= start && k < end) {
|
||||
@ -167,7 +167,7 @@ Tensor & embedding_renorm_cpu_(
|
||||
|
||||
// Note that we cannot use at::parallel_for here because we perform operations on
|
||||
// Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details.
|
||||
for (const auto i : c10::irange(num_indices)) {
|
||||
for (auto i = 0; i < num_indices; i++) {
|
||||
if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) {
|
||||
continue;
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ index_select_add(const Tensor &select_indices,
|
||||
auto output_stride0 = output.strides()[0];
|
||||
auto output_stride1 = output.strides()[1];
|
||||
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
for (int64_t i = 0; i < numel; i++) {
|
||||
// We can skip indices equal to padding_idx so they are not included in
|
||||
// the reduction
|
||||
if (select_indices_data[i] != padding_idx) {
|
||||
@ -247,7 +247,7 @@ index_select_add(const Tensor &select_indices,
|
||||
auto output_stride0 = output.strides()[0];
|
||||
auto output_stride1 = output.strides()[1];
|
||||
auto numel = add_indices.numel();
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
for (int64_t i = 0; i < numel; i++) {
|
||||
// We can skip indices equal to padding_idx so they are not included in
|
||||
// the reduction
|
||||
if (select_indices_data[i] != padding_idx) {
|
||||
@ -302,14 +302,14 @@ index_select_scale_add(const Tensor &select_indices,
|
||||
auto* scale_data = scale.data_ptr<data_t>();
|
||||
auto scale_stride = scale.strides()[0];
|
||||
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
for (int64_t i = 0; i < numel; i++) {
|
||||
// We can skip indices equal to padding_idx so they are not included in
|
||||
// the reduction
|
||||
if (select_indices_data[i] != padding_idx) {
|
||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||
auto scale = scale_data[i * scale_stride];
|
||||
for (const auto j : c10::irange(ddim)) {
|
||||
for (int64_t j = 0; j < ddim; j++) {
|
||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||
}
|
||||
} else if (bag_size.defined()) {
|
||||
@ -419,14 +419,14 @@ index_select_scale_add(const Tensor &select_indices,
|
||||
auto numel = add_indices.numel();
|
||||
|
||||
|
||||
for (const auto i : c10::irange(numel)) {
|
||||
for (int64_t i = 0; i < numel; i++) {
|
||||
// We can skip indices equal to padding_idx so they are not included in
|
||||
// the reduction
|
||||
if (select_indices_data[i] != padding_idx) {
|
||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||
auto scale = scale_data[i * scale_stride];
|
||||
for (const auto j : c10::irange(ddim)) {
|
||||
for (int64_t j = 0; j < ddim; j++) {
|
||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||
}
|
||||
} else if (bag_size.defined()) {
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <ATen/Utils.h>
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -64,7 +63,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
||||
|
||||
if (nDims > 2) {
|
||||
int64_t dim1 = height;
|
||||
for (const auto i : c10::irange(1, nDims)) {
|
||||
for (int64_t i = 1; i < nDims; i++) {
|
||||
if (self.size(i) != dim1) {
|
||||
AT_ERROR("all dimensions of input must be of equal length");
|
||||
}
|
||||
@ -77,7 +76,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
||||
int64_t size = std::min(height, width);
|
||||
|
||||
int64_t stride = 0;
|
||||
for (const auto i : c10::irange(nDims)) {
|
||||
for (int64_t i = 0; i < nDims; i++) {
|
||||
stride += self.stride(i);
|
||||
}
|
||||
strides.push_back(stride);
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
@ -33,7 +32,7 @@ TORCH_META_FUNC(fractional_max_pool2d) (
|
||||
int64_t ndims = input.ndimension();
|
||||
TORCH_CHECK(ndims == 3 || ndims == 4,
|
||||
"fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes());
|
||||
for (const auto i : c10::irange(1, ndims)) {
|
||||
for (int64_t i = 1; i < ndims; ++i) {
|
||||
TORCH_CHECK(input.size(i) > 0,
|
||||
"fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||
input.sizes(), " with dimension ", i, " being empty.");
|
||||
@ -107,7 +106,7 @@ static void fractional_max_pool2d_out_single_batch_frame(
|
||||
int outputW, int outputH,
|
||||
int poolSizeW, int poolSizeH) {
|
||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto plane : c10::irange(start, end)) {
|
||||
for (auto plane = start; plane < end; ++plane) {
|
||||
/* each plane contains 2 random samples, one for W and one for H */
|
||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
|
||||
|
||||
@ -178,7 +177,7 @@ static void fractional_max_pool2d_out_frame(
|
||||
return;
|
||||
}
|
||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto batch : c10::irange(start, end)) {
|
||||
for (auto batch = start; batch < end; ++batch) {
|
||||
fractional_max_pool2d_out_single_batch_frame<scalar_t>(
|
||||
input + batch * numPlanes * inputH * inputW,
|
||||
output + batch * numPlanes * outputH * outputW,
|
||||
@ -255,7 +254,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame(
|
||||
int inputW, int inputH,
|
||||
int outputW, int outputH) {
|
||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto plane : c10::irange(start, end)) {
|
||||
for (auto plane = start; plane < end; plane++) {
|
||||
scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH;
|
||||
scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH;
|
||||
int64_t* indicesForPlane = indices + plane * outputW * outputH;
|
||||
@ -292,7 +291,7 @@ static void fractional_max_pool2d_backward_out_frame(
|
||||
return;
|
||||
}
|
||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto batch : c10::irange(start, end)) {
|
||||
for (auto batch = start; batch < end; ++batch) {
|
||||
fractional_max_pool2d_backward_out_single_batch_frame<scalar_t>(
|
||||
gradInput + batch * numPlanes * inputH * inputW,
|
||||
gradOutput + batch * numPlanes * outputH * outputW,
|
||||
|
@ -44,7 +44,7 @@ static void fractional_max_pool3d_out_single_batch_frame(
|
||||
int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) {
|
||||
|
||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto plane : c10::irange(start, end)) {
|
||||
for (auto plane = start; plane < end; ++plane) {
|
||||
/* each plane contains 3 random samples,
|
||||
one for T, one for W, and one for H */
|
||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 3;
|
||||
@ -126,7 +126,7 @@ static void fractional_max_pool3d_out_frame(
|
||||
}
|
||||
|
||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto batch : c10::irange(start, end)) {
|
||||
for (auto batch = start; batch < end; ++batch) {
|
||||
fractional_max_pool3d_out_single_batch_frame<scalar_t>(
|
||||
input + batch * numPlanes * inputW * inputH * inputT,
|
||||
output + batch * numPlanes * outputW * outputH * outputT,
|
||||
@ -171,7 +171,7 @@ void fractional_max_pool3d_out_cpu_template(
|
||||
TORCH_CHECK(ndims == 4 || ndims == 5,
|
||||
"fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ",
|
||||
input_.sizes());
|
||||
for (const auto i : c10::irange(1, ndims)) {
|
||||
for (int64_t i = 1; i < ndims; ++i) {
|
||||
TORCH_CHECK(input_.size(i) > 0,
|
||||
"fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||
input_.sizes(), " with dimension ", i, " being empty.");
|
||||
@ -243,7 +243,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame(
|
||||
int64_t outputT, int64_t outputH, int64_t outputW) {
|
||||
|
||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto plane : c10::irange(start, end)) {
|
||||
for (auto plane = start; plane < end; plane++) {
|
||||
scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW;
|
||||
scalar_t* gradOutputForPlane = gradOutput +
|
||||
plane * outputT * outputH * outputW;
|
||||
@ -284,7 +284,7 @@ static void fractional_max_pool3d_backward_out_frame(
|
||||
}
|
||||
|
||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto batch : c10::irange(start, end)) {
|
||||
for (auto batch = start; batch < end; ++batch) {
|
||||
fractional_max_pool3d_backward_out_single_batch_frame<scalar_t>(
|
||||
gradInput + batch * numPlanes * inputW * inputH * inputT,
|
||||
gradOutput + batch * numPlanes * outputW * outputH * outputT,
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <ATen/native/UpSample.h>
|
||||
#include <ATen/native/cpu/GridSamplerKernel.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
@ -52,12 +51,12 @@ namespace {
|
||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||
// loop over each output pixel
|
||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto n : c10::irange(start, end)) {
|
||||
for (int64_t n = start; n < end; ++n) {
|
||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||
for (const auto d : c10::irange(out_D)) {
|
||||
for (const auto h : c10::irange(out_H)) {
|
||||
for (const auto w : c10::irange(out_W)) {
|
||||
for (int64_t d = 0; d < out_D; ++d) {
|
||||
for (int64_t h = 0; h < out_H; ++h) {
|
||||
for (int64_t w = 0; w < out_W; ++w) {
|
||||
// get the corresponding input x, y, z co-ordinates from grid
|
||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||
scalar_t ix = *grid_ptr_NDHW;
|
||||
@ -223,12 +222,12 @@ namespace {
|
||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||
// loop over each output pixel
|
||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto n : c10::irange(start, end)) {
|
||||
for (int64_t n = start; n < end; ++n) {
|
||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||
scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
|
||||
for (const auto d : c10::irange(out_D)) {
|
||||
for (const auto h : c10::irange(out_H)) {
|
||||
for (int64_t d = 0; d < out_D; ++d) {
|
||||
for (int64_t h = 0; h < out_H; ++h) {
|
||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||
// get the corresponding input x, y, z co-ordinates from grid
|
||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||
@ -417,11 +416,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||
// loop over each output pixel
|
||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto n : c10::irange(start, end)) {
|
||||
for (int64_t n = start; n < end; ++n) {
|
||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||
for (const auto h : c10::irange(out_H)) {
|
||||
for (const auto w : c10::irange(out_W)) {
|
||||
for (int64_t h = 0; h < out_H; ++h) {
|
||||
for (int64_t w = 0; w < out_W; ++w) {
|
||||
// get the corresponding input x, y, z co-ordinates from grid
|
||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||
scalar_t x = *grid_ptr_NHW;
|
||||
@ -506,7 +505,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
||||
scalar_t coefficients[4];
|
||||
|
||||
// Interpolate 4 values in the x directon
|
||||
for (const auto i : c10::irange(4)) {
|
||||
for (int64_t i = 0; i < 4; ++i) {
|
||||
coefficients[i] = cubic_interp1d<scalar_t>(
|
||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||
@ -579,11 +578,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||
// loop over each output pixel
|
||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto n : c10::irange(start, end)) {
|
||||
for (int64_t n = start; n < end; ++n) {
|
||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
|
||||
for (const auto h : c10::irange(out_H)) {
|
||||
for (int64_t h = 0; h < out_H; ++h) {
|
||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||
// get the corresponding input x, y co-ordinates from grid
|
||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||
@ -704,8 +703,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
||||
for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) {
|
||||
scalar_t gOut = *gOut_ptr_NCHW;
|
||||
|
||||
for (const auto i : c10::irange(4)) {
|
||||
for (const auto j : c10::irange(4)) {
|
||||
for (int64_t i = 0; i < 4; ++i) {
|
||||
for (int64_t j = 0; j < 4; ++j) {
|
||||
|
||||
// set input gradient
|
||||
add_value_bounded<scalar_t>(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j,
|
||||
@ -858,7 +857,7 @@ Tensor grid_sampler(const Tensor& input, const Tensor& grid,
|
||||
!(input.dim() == 5 && static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Bicubic),
|
||||
"grid_sampler(): bicubic interpolation only supports 4D input"
|
||||
);
|
||||
for (const auto i : c10::irange(2, input.dim())) {
|
||||
for (int64_t i = 2; i < input.dim(); i++) {
|
||||
TORCH_CHECK(input.size(i) > 0,
|
||||
"grid_sampler(): expected input to have non-empty spatial dimensions, "
|
||||
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
||||
|
@ -5,7 +5,6 @@
|
||||
|
||||
#include <ATen/native/im2col.h>
|
||||
#include <ATen/native/im2col_shape_check.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -92,7 +91,7 @@ static void im2col_out_cpu_template(
|
||||
Tensor input_n;
|
||||
Tensor output_n;
|
||||
|
||||
for (const auto elt : c10::irange(batch_size)) {
|
||||
for (int64_t elt = 0; elt < batch_size; elt++) {
|
||||
input_n = input.select(0, elt);
|
||||
output_n = output.select(0, elt);
|
||||
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ATen/ExpandUtils.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <ATen/core/List.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
@ -32,7 +31,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
|
||||
}
|
||||
// The sizes of the ByteTensor mask or bool tensor must match the sizes of the
|
||||
// corresponding dimensions in self
|
||||
for (const auto j : c10::irange(index.dim())) {
|
||||
for (int64_t j = 0; j < index.dim(); j++) {
|
||||
int64_t srcIdx = result.size() + j;
|
||||
if (index.size(j) != self.size(srcIdx)) {
|
||||
invalid_mask(self, srcIdx, index, j);
|
||||
@ -40,7 +39,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
|
||||
}
|
||||
// Replace with nonzeros
|
||||
auto nonzero = index.nonzero();
|
||||
for (const auto j : c10::irange(index.dim())) {
|
||||
for (int64_t j = 0; j < index.dim(); j++) {
|
||||
result.emplace_back(nonzero.select(1, j));
|
||||
}
|
||||
} else {
|
||||
|
@ -1158,7 +1158,7 @@ static void addbmm_impl_(
|
||||
}
|
||||
|
||||
auto adjusted_beta(beta);
|
||||
for (const auto batch : c10::irange(num_batches)) {
|
||||
for (int64_t batch = 0; batch < num_batches; ++batch) {
|
||||
result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha);
|
||||
adjusted_beta = 1; // accumulate output once
|
||||
}
|
||||
@ -1215,23 +1215,23 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T
|
||||
|
||||
int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1);
|
||||
parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) {
|
||||
for (const auto b : c10::irange(b_begin, b_end)) {
|
||||
for (int64_t b = b_begin; b < b_end; b++) {
|
||||
auto r1 = r0[b];
|
||||
auto s1 = s0[b];
|
||||
auto m1 = m0[b];
|
||||
for (const auto i : c10::irange(is)) {
|
||||
for (int64_t i = 0; i < is; i++) {
|
||||
auto r2 = r1[i];
|
||||
auto s2 = s1[i];
|
||||
for (const auto j : c10::irange(js)) {
|
||||
for (int64_t j = 0; j < js; j++) {
|
||||
scalar_t &r = r2[j];
|
||||
if (is_bmm) {
|
||||
r = 0;
|
||||
for (const auto k : c10::irange(ks)) {
|
||||
for (int64_t k = 0; k < ks; k++) {
|
||||
r += s2[k] * m1[k][j];
|
||||
}
|
||||
} else {
|
||||
r *= beta;
|
||||
for (const auto k : c10::irange(ks)) {
|
||||
for (int64_t k = 0; k < ks; k++) {
|
||||
r += alpha * s2[k] * m1[k][j];
|
||||
}
|
||||
}
|
||||
@ -1994,11 +1994,10 @@ void compute_T18_scale_square(
|
||||
auto mexp_scaled = at::native::compute_T18<scalar_t>(a_scaled);
|
||||
auto s_cpu = (s.device().type() == at::kCPU)
|
||||
? s : s.to(at::kCPU);
|
||||
for (const auto i : c10::irange(mexp_scaled.size(0))) {
|
||||
for (int64_t i = 0; i < mexp_scaled.size(0); ++i) {
|
||||
auto s_val = s_cpu.select(0, i).template item<int64_t>();
|
||||
auto mexp = mexp_scaled.select(0, i);
|
||||
for (const auto p : c10::irange(s_val)) {
|
||||
(void)p; //Suppress unused variable warning
|
||||
for (int64_t p = 0; p < s_val; ++p) {
|
||||
mexp = at::matmul(mexp, mexp);
|
||||
}
|
||||
mexp_out.select(0, i).copy_(mexp);
|
||||
@ -2266,7 +2265,7 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens
|
||||
// (e.g. [0, 1, 2, ..., ndim-1])
|
||||
static std::vector<int64_t> make_dim_list(int64_t ndim) {
|
||||
std::vector<int64_t> dim_list(ndim);
|
||||
for (const auto ind : c10::irange(ndim)) {
|
||||
for (int64_t ind = 0; ind < ndim; ind++) {
|
||||
dim_list[ind] = ind;
|
||||
}
|
||||
return dim_list;
|
||||
@ -2819,7 +2818,7 @@ struct KronImpl final {
|
||||
a_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
|
||||
b_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
|
||||
result_reshape = c10::SmallVector<int64_t, 10>(maxdim);
|
||||
for (const auto i : c10::irange(maxdim)) {
|
||||
for (int64_t i = 0; i < maxdim; i++) {
|
||||
a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1);
|
||||
a_reshape[2 * i + 1] = 1;
|
||||
b_reshape[2 * i] = 1;
|
||||
@ -2834,7 +2833,7 @@ struct KronImpl final {
|
||||
TORCH_INTERNAL_ASSERT(result.defined(), "Cannot call kron_out with an undefined result tensor as the out argument. Please allocate a Tensor before calling kron_out with it.");
|
||||
|
||||
c10::SmallVector<int64_t, 10> mul_shape(2 * maxdim);
|
||||
for (const auto i : c10::irange(maxdim)) {
|
||||
for (int64_t i = 0; i < maxdim; i++) {
|
||||
mul_shape[2 * i] = a_reshape[2 * i];
|
||||
mul_shape[2 * i + 1] = b_reshape[2 * i + 1];
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
@ -170,8 +169,7 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
|
||||
auto* b_batch_idx_ptr = data[0];
|
||||
auto* a_batch_idx_ptr = data[1];
|
||||
|
||||
for (const auto elem : c10::irange(nelems)) {
|
||||
(void)elem; //Suppress unused variable warning
|
||||
for (int64_t elem = 0; elem < nelems; ++elem) {
|
||||
auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
|
||||
auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
|
||||
|
||||
@ -334,7 +332,7 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
|
||||
const int64_t ndim = self.ndimension();
|
||||
std::vector<int64_t> perm;
|
||||
|
||||
for (const auto i : c10::irange(ndim)) {
|
||||
for (int64_t i = 0; i < ndim; i++) {
|
||||
auto it = std::find(a.begin(), a.end(), i);
|
||||
if (it == a.end()) {
|
||||
perm.push_back(i);
|
||||
@ -478,7 +476,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
|
||||
"duplicate or invalid dimensions");
|
||||
std::vector<int64_t> permutation(ndim);
|
||||
int64_t cur_permuted_dim = 0;
|
||||
for (const auto dim_ind : c10::irange(ndim)) {
|
||||
for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
|
||||
if ((dim_ind != dim0) && (dim_ind != dim1)) {
|
||||
permutation[cur_permuted_dim++] = dim_ind;
|
||||
}
|
||||
@ -495,7 +493,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
|
||||
static inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
|
||||
int64_t ndim = permutation.size();
|
||||
std::vector<int64_t> reverse_permutation(ndim);
|
||||
for (const auto dim_ind : c10::irange(ndim)) {
|
||||
for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
|
||||
reverse_permutation[permutation[dim_ind]] = dim_ind;
|
||||
}
|
||||
return reverse_permutation;
|
||||
|
@ -11,7 +11,6 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/native/Fill.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
@ -61,7 +60,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
|
||||
std::vector<int64_t> tg_batch_offsets(batch_size);
|
||||
if (targets.dim() == 1) { // concatenated targets
|
||||
int64_t pos = 0;
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
tg_batch_offsets[i] = pos;
|
||||
pos += target_lengths[i];
|
||||
if (max_target_length < target_lengths[i])
|
||||
@ -73,7 +72,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
|
||||
else { // batch x max_target_length
|
||||
// dim is 2
|
||||
int64_t tg_batch_stride = targets.stride(0);
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
tg_batch_offsets[i] = i * tg_batch_stride;
|
||||
if (max_target_length < target_lengths[i])
|
||||
max_target_length = target_lengths[i];
|
||||
@ -85,7 +84,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
|
||||
" (while checking arguments for ", c, ")");
|
||||
}
|
||||
int64_t max_input_length = log_probs.size(0);
|
||||
for (const auto b : c10::irange(batch_size)) {
|
||||
for (int64_t b = 0; b < batch_size; b++) {
|
||||
TORCH_CHECK(input_lengths[b] <= max_input_length,
|
||||
"Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b],
|
||||
" (while checking arguments for ", c, ")");
|
||||
@ -104,7 +103,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
|
||||
// first the default
|
||||
log_alpha.narrow(1, 0, 1).fill_(neginf);
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
int64_t input_length = input_lengths[b];
|
||||
int64_t target_length = target_lengths[b];
|
||||
auto log_probs_a = log_probs_a_global[b];
|
||||
@ -117,7 +116,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
|
||||
log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)];
|
||||
|
||||
// now the loop over the inputs
|
||||
for (const auto t : c10::irange(1, input_length)) {
|
||||
for (int64_t t=1; t<input_length; t++) {
|
||||
for (int64_t s=0; s<2*target_length+1; s++) {
|
||||
auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
|
||||
// this loop over s could be parallel/vectorized, too, but the required items are one index apart
|
||||
@ -190,7 +189,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
|
||||
if (targets.dim() == 1) { // concatenated targets
|
||||
int64_t pos = 0;
|
||||
max_target_length = 0;
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
tg_batch_offsets[i] = pos;
|
||||
pos += target_lengths[i];
|
||||
if (max_target_length < target_lengths[i])
|
||||
@ -201,7 +200,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
|
||||
else { // batch x max_target_length
|
||||
// dim is 2
|
||||
int64_t tg_batch_stride = targets.stride(0);
|
||||
for (const auto i : c10::irange(batch_size)) {
|
||||
for (int64_t i = 0; i < batch_size; i++) {
|
||||
tg_batch_offsets[i] = i * tg_batch_stride;
|
||||
}
|
||||
tg_target_stride = targets.stride(1);
|
||||
@ -235,7 +234,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
|
||||
TensorIterator fill_1d_iter_local(fill_1d_iter);
|
||||
TensorIterator fill_log_beta_1d_iter_local(fill_log_beta_1d_iter);
|
||||
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
scalar_t nll = neg_log_likelihood.accessor<scalar_t, 1>()[b];
|
||||
auto grad_a = grad_a_global[b];
|
||||
if (zero_infinity && nll == std::numeric_limits<scalar_t>::infinity()) {
|
||||
@ -323,8 +322,8 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
|
||||
// this could be a great target for further vectorization.
|
||||
// grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16)
|
||||
scalar_t gr = grad_out.accessor<scalar_t, 1>()[b];
|
||||
for (const auto t : c10::irange(input_length)) { // or go for the full thing?
|
||||
for (const auto c : c10::irange(num_labels)) {
|
||||
for (int64_t t = 0; t < input_length; t++) { // or go for the full thing?
|
||||
for (int64_t c = 0; c < num_labels; c++) {
|
||||
scalar_t& res = grad_a[t][c];
|
||||
scalar_t lp = log_probs_a[t][c];
|
||||
res = (std::exp(lp)-std::exp(res + nll - lp)) * gr;
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/native/LossMulti.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -18,21 +17,21 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu(
|
||||
int64_t dim) {
|
||||
using accscalar_t = at::acc_type<scalar_t, false>;
|
||||
accscalar_t sum = 0;
|
||||
for (const auto ddt : c10::irange(dim)) {
|
||||
for (int64_t ddt = 0; ddt < dim; ddt++) {
|
||||
int64_t target_idx = target_data[ddt];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
is_target_data[target_idx] = 1;
|
||||
}
|
||||
for (const auto dt : c10::irange(dim)) {
|
||||
for (int64_t dt = 0; dt < dim; dt++) {
|
||||
int64_t target_idx = target_data[dt];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
scalar_t input_target = input_data[target_idx];
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
if (!is_target_data[d]) {
|
||||
scalar_t z = 1 - input_target + input_data[d];
|
||||
if (z > 0) {
|
||||
@ -64,8 +63,7 @@ static void multilabel_margin_loss_forward_out_frame(
|
||||
|
||||
accscalar_t sum = 0;
|
||||
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
(void)t; //Suppress unused variable warning
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
sum += multilabel_margin_loss_forward_inner_sum_cpu(
|
||||
input_data, target_data, is_target_data, dim);
|
||||
|
||||
@ -83,7 +81,7 @@ static void multilabel_margin_loss_forward_out_frame(
|
||||
} else {
|
||||
auto output_acc = output.accessor<scalar_t, 1>();
|
||||
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
scalar_t sum = multilabel_margin_loss_forward_inner_sum_cpu(
|
||||
input_data, target_data, is_target_data, dim);
|
||||
|
||||
@ -173,16 +171,15 @@ static void multilabel_margin_loss_backward_out_frame(
|
||||
reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim);
|
||||
|
||||
scalar_t* grad_input_row_data = grad_input.data_ptr<scalar_t>();
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
(void)t; //Suppress unused variable warning
|
||||
for (const auto dt : c10::irange(dim)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
for (int64_t dt = 0; dt < dim; dt++) {
|
||||
int64_t target_idx = target_data[dt];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
scalar_t input_target = input_data[target_idx];
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
if (!is_target_data[d]) {
|
||||
scalar_t z = 1 - input_target + input_data[d];
|
||||
if (z > 0) {
|
||||
@ -209,8 +206,8 @@ static void multilabel_margin_loss_backward_out_frame(
|
||||
} else {
|
||||
check_dim_size(grad_output, 1, 0, nframe);
|
||||
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
grad_input_data[t * dim + d] *= grad_output_acc[t];
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/native/LossMulti.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -19,7 +18,7 @@ inline scalar_t multi_margin_inner_sum_cpu(
|
||||
const int64_t target_idx) {
|
||||
const scalar_t input_target = input_data[target_idx];
|
||||
scalar_t sum = 0;
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
if (d == target_idx) {
|
||||
continue;
|
||||
}
|
||||
@ -64,7 +63,7 @@ static inline void multi_margin_loss_cpu_kernel(
|
||||
// cannot be handled by TensorAccessor)
|
||||
if (reduction == Reduction::None && output.dim() > 0) {
|
||||
auto output_acc = output.accessor<scalar_t, 1>();
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
const auto idx = target_index_checked(target_data, t, dim);
|
||||
auto sum = multi_margin_inner_sum_cpu(
|
||||
input_data, weight_data, p, margin, dim, idx);
|
||||
@ -74,7 +73,7 @@ static inline void multi_margin_loss_cpu_kernel(
|
||||
} else {
|
||||
accscalar_t sum = 0;
|
||||
auto output_acc = output.data_ptr<scalar_t>();
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
const auto idx = target_index_checked(target_data, t, dim);
|
||||
sum += multi_margin_inner_sum_cpu(
|
||||
input_data, weight_data, p, margin, dim, idx);
|
||||
@ -150,11 +149,11 @@ static void multi_margin_loss_backward_cpu_kernel(
|
||||
int64_t dim,
|
||||
int64_t reduction) {
|
||||
scalar_t* grad_input_row_data = grad_input_data;
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
int64_t target_idx = target_index_checked(target_data, t, dim);
|
||||
scalar_t input_target = input_data[target_idx];
|
||||
scalar_t grad_input_target = 0;
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
scalar_t z = margin - input_target + input_data[d];
|
||||
if (d == target_idx) {
|
||||
continue;
|
||||
@ -187,8 +186,8 @@ static void multi_margin_loss_backward_cpu_kernel(
|
||||
}
|
||||
} else {
|
||||
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
|
||||
for (const auto t : c10::irange(nframe)) {
|
||||
for (const auto d : c10::irange(dim)) {
|
||||
for (int64_t t = 0; t < nframe; t++) {
|
||||
for (int64_t d = 0; d < dim; d++) {
|
||||
grad_input_data[t * dim + d] *= grad_output_acc[t];
|
||||
}
|
||||
}
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <c10/util/SmallBuffer.h>
|
||||
|
||||
#include <c10/core/TensorOptions.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace meta {
|
||||
@ -156,7 +155,7 @@ static void nll_loss_out_frame(
|
||||
auto output_acc = output.accessor<scalar_t, 1>();
|
||||
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
const auto cur_target = target_acc[i];
|
||||
|
||||
if (cur_target == ignore_index) {
|
||||
@ -216,7 +215,7 @@ static void nll_loss_out_frame(
|
||||
scalar_t weight_partial_sums[cascade_sum_num_levels] = {0};
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
scalar_t loss_partial_sums[cascade_sum_num_levels] = {0};
|
||||
for (const auto b : c10::irange(batch_size)) {
|
||||
for (int64_t b = 0; b < batch_size; b++) {
|
||||
const int64_t cur_target = target_data[b];
|
||||
if (cur_target == ignore_index) {
|
||||
++num_ignored;
|
||||
@ -331,7 +330,7 @@ static void nll_loss_backward_out_frame(
|
||||
auto grad_input_acc = grad_input.accessor<scalar_t, 2>();
|
||||
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
for (auto i = start; i < end; i++) {
|
||||
auto cur_target = target_acc[i];
|
||||
if (cur_target == ignore_index) {
|
||||
continue;
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/native/cpu/utils.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -110,9 +109,9 @@ static void nll_loss2d_forward_out_frame(
|
||||
auto target_acc = target.accessor<int64_t, 3>();
|
||||
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (const auto h : c10::irange(H)) {
|
||||
for (const auto w : c10::irange(W)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
for (int64_t h = 0; h < H; h++) {
|
||||
for (int64_t w = 0; w < W; w++) {
|
||||
const int64_t cur_target = (int64_t)target_acc[b][h][w];
|
||||
|
||||
if (cur_target == ignore_index) {
|
||||
@ -177,8 +176,8 @@ static void nll_loss2d_forward_out_frame(
|
||||
const int64_t level_mask = level_step - 1;
|
||||
|
||||
int64_t num_ignored = 0;
|
||||
for (const auto b : c10::irange(batch_size)) {
|
||||
for (const auto elem : c10::irange(map_size)) {
|
||||
for (int64_t b = 0; b < batch_size; b++) {
|
||||
for (int64_t elem = 0; elem < map_size; elem++) {
|
||||
const int64_t cur_target = target_data[b * map_size + elem];
|
||||
if (cur_target == ignore_index) {
|
||||
++num_ignored;
|
||||
@ -287,9 +286,9 @@ static void nll_loss2d_backward_out_frame(
|
||||
auto target_acc = target.accessor<int64_t, 3>();
|
||||
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (const auto h : c10::irange(H)) {
|
||||
for (const auto w : c10::irange(W)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
for (int64_t h = 0; h < H; h++) {
|
||||
for (int64_t w = 0; w < W; w++) {
|
||||
const int64_t cur_target = target_acc[b][h][w];
|
||||
if (cur_target == ignore_index) {
|
||||
continue;
|
||||
@ -330,8 +329,8 @@ static void nll_loss2d_backward_out_frame(
|
||||
: grad_output_value);
|
||||
|
||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||
for (const auto b : c10::irange(start, end)) {
|
||||
for (const auto elem : c10::irange(map_size)) {
|
||||
for (int64_t b = start; b < end; b++) {
|
||||
for (int64_t elem = 0; elem < map_size; elem++) {
|
||||
const int64_t t = target_data[b * map_size + elem];
|
||||
|
||||
if (t != ignore_index) {
|
||||
|
@ -60,7 +60,6 @@ bool _nnpack_available() {
|
||||
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
|
||||
#include <ATen/native/ConvUtils.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
@ -239,7 +238,7 @@ Tensor _nnpack_spatial_convolution(
|
||||
const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
|
||||
const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;
|
||||
|
||||
for (const auto batch : c10::irange(0u, batch_size)) {
|
||||
for (size_t batch = 0u; batch < batch_size; ++batch) {
|
||||
const nnp_status status = nnp_convolution_inference(
|
||||
algorithm,
|
||||
nnp_convolution_transform_strategy_compute,
|
||||
|
@ -100,7 +100,7 @@ Tensor refine_names(const Tensor& self, DimnameList names) {
|
||||
self_names.size(), " and ", names.size(), " respectively).");
|
||||
check_names_valid_for(self, names);
|
||||
|
||||
for (const auto idx : c10::irange(self_names.size())) {
|
||||
for (size_t idx = 0; idx < self_names.size(); idx++) {
|
||||
const auto& self_name = self_names[idx];
|
||||
const auto& out_name = names[idx];
|
||||
if (self_name == out_name || self_name.isWildcard()) {
|
||||
@ -221,7 +221,7 @@ Tensor align_to(const Tensor& tensor, DimnameList order, int64_t ellipsis_idx) {
  };

  // Fill in the non-ellipsis dimensions
-  for (const auto order_idx : c10::irange(0U, order.size())) {
+  for (auto order_idx = 0U; order_idx < order.size(); ++order_idx) {
    auto out_idx = order_idx;
    if (order_idx >= ellipsis_idx) {
      out_idx = order_idx + num_ellipsis_names;
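The align_to loop maps each explicitly named dimension in the requested order to its output position: names at or past the ellipsis are shifted by however many dimensions the ellipsis expands to. A tiny sketch of that index remapping, with purely illustrative values for ellipsis_idx, num_ellipsis_names, and order_size:

#include <cstdio>

int main() {
  // Say the requested order has 3 explicit names and an ellipsis at position 1
  // that expands to 2 unnamed dimensions (illustrative numbers).
  const unsigned ellipsis_idx = 1;
  const unsigned num_ellipsis_names = 2;
  const unsigned order_size = 3;

  for (auto order_idx = 0U; order_idx < order_size; ++order_idx) {
    auto out_idx = order_idx;
    if (order_idx >= ellipsis_idx) {
      // Names after the ellipsis land past the dimensions it expands to.
      out_idx = order_idx + num_ellipsis_names;
    }
    std::printf("order_idx=%u -> out_idx=%u\n", order_idx, out_idx);  // 0->0, 1->3, 2->4
  }
  return 0;
}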
@ -10,7 +10,6 @@
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/batch_norm.h>
#include <ATen/native/Normalization.h>
-#include <c10/util/irange.h>

#include <vector>
@ -157,7 +156,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
  // Reduce all dimensions except dim=1
  DimVector reduce_dims(ndim - 1);
  reduce_dims[0] = 0;
-  for (const auto i : c10::irange(2, ndim)) {
+  for (int64_t i = 2; i < ndim; ++i) {
    reduce_dims[i - 1] = i;
  }
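The reduce_dims loop builds the list of dimensions to reduce over when computing per-channel statistics: everything except dim 1 (channels). A standalone sketch of the same bookkeeping for an NCHW-shaped input (ndim = 4 is illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t ndim = 4;  // e.g. an NCHW input
  std::vector<int64_t> reduce_dims(ndim - 1);
  reduce_dims[0] = 0;                   // always reduce the batch dim
  for (int64_t i = 2; i < ndim; ++i) {  // and every spatial dim ...
    reduce_dims[i - 1] = i;             // ... leaving dim 1 (channels) alone
  }
  for (auto d : reduce_dims) std::printf("%lld ", (long long)d);  // prints: 0 2 3
  std::printf("\n");
  return 0;
}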
@ -179,7 +178,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
|
||||
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);
|
||||
|
||||
parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
|
||||
for (const auto f : c10::irange(b_begin, b_end)) {
|
||||
for (int64_t f = b_begin; f < b_end; ++f) {
|
||||
save_mean_a[f] = _mean_a[f];
|
||||
save_var_transform_a[f] = VarTransform<accscalar_t>{}(_var_sum_a[f] / n, eps);
|
||||
|
||||
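Here each channel's accumulated sum of squared deviations (_var_sum_a[f]) is divided by the element count and passed through a VarTransform functor. The functor's definition is outside this hunk; the sketch below assumes the usual inverse-standard-deviation transform used by batch norm (variance plus eps, then reciprocal square root), so treat it as an illustration of that formula rather than the functor itself.

#include <cmath>
#include <cstdio>

// Assumed InvStd-style transform: variance -> 1 / sqrt(var + eps).
static double inv_std(double var_sum, double n, double eps) {
  return 1.0 / std::sqrt(var_sum / n + eps);
}

int main() {
  const double var_sum = 40.0;  // sum of squared deviations for one channel
  const double n = 10.0;        // elements reduced per channel
  const double eps = 1e-5;
  std::printf("invstd = %.6f\n", inv_std(var_sum, n, eps));  // ~0.5
  return 0;
}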
@ -207,7 +206,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(

  parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
    TensorIterator iter(reduce_iter);
-    for (const auto f : c10::irange(b_begin, b_end)) {
+    for (int64_t f = b_begin; f < b_end; ++f) {
      // compute variance per input
      iter.unsafe_replace_operand(0, in_data + channel_stride * f);
      accscalar_t var_sum = 0;
@ -284,7 +283,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
  // Reduce all dimensions except dim=1
  DimVector reduce_dims(ndim - 1);
  reduce_dims[0] = 0;
-  for (const auto i : c10::irange(2, ndim)) {
+  for (int64_t i = 2; i < ndim; ++i) {
    reduce_dims[i - 1] = i;
  }
@ -331,7 +330,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
    TensorIterator unary_iter_local(unary_iter);
    TensorIterator binary_iter_local(binary_iter);

-    for (const auto f : c10::irange(b_begin, b_end)) {
+    for (int64_t f = b_begin; f < b_end; ++f) {
      scalar_t w = weight.defined() ? weight_a[f] : 1;

      scalar_t mean, invstd;
@ -77,7 +77,7 @@ std::tuple<Tensor, Tensor> _pack_padded_sequence(const Tensor& _input, const Ten
  // more elements below in our column, we lower the counter (prev_l), and append the new
  // block to the output.
  int64_t prev_l = 0;
-  for (const auto i : c10::irange(batch_size)) {
+  for (int64_t i = 0; i < batch_size; ++i) {
    int64_t l = lengths[batch_size - 1 - i];
    if (l > prev_l) {
      auto current_batch_size = batch_size - i;
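This loop walks the sorted-descending lengths from the shortest up; every time the length increases, it appends blocks whose batch size is the number of sequences still "alive". A standalone sketch of just that batch_sizes bookkeeping (the real function also copies the data and handles the batch_first layout; the lengths here are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Sequence lengths sorted in decreasing order, as _pack_padded_sequence expects.
  const std::vector<int64_t> lengths = {5, 3, 3, 1};
  const int64_t batch_size = (int64_t)lengths.size();

  // batch_sizes[t] = how many sequences are still alive at time step t.
  std::vector<int64_t> batch_sizes;
  int64_t prev_l = 0;
  for (int64_t i = 0; i < batch_size; ++i) {
    int64_t l = lengths[batch_size - 1 - i];  // walk from the shortest up
    if (l > prev_l) {
      auto current_batch_size = batch_size - i;  // sequences at least this long
      for (int64_t t = prev_l; t < l; ++t) {
        batch_sizes.push_back(current_batch_size);
      }
      prev_l = l;
    }
  }
  for (auto b : batch_sizes) std::printf("%lld ", (long long)b);  // prints: 4 3 3 1 1
  std::printf("\n");
  return 0;
}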
@ -109,7 +109,7 @@ Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_
  int64_t offset = 0;
  int64_t max_seq_len = batch_sizes_t.size(0);
  int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
-  for (const auto i : c10::irange(max_seq_len)) {
+  for (int64_t i = 0; i < max_seq_len; ++i) {
    grad_input[i].slice(0, 0, batch_sizes[i]).copy_(grad.slice(0, offset, offset + batch_sizes[i]));
    offset += batch_sizes[i];
  }
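The backward pass undoes the packing: time step i occupies rows [offset, offset + batch_sizes[i]) of the packed gradient, and that slice is copied into row i of the padded gradient. A standalone sketch of only the offset bookkeeping (the real code copies tensor slices; the batch_sizes values match the illustrative example above):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // batch_sizes produced by the packing step (see the sketch above).
  const std::vector<int64_t> batch_sizes = {4, 3, 3, 1, 1};
  int64_t offset = 0;
  for (int64_t i = 0; i < (int64_t)batch_sizes.size(); ++i) {
    // These are the packed rows that feed padded row i in the backward copy.
    std::printf("step %lld: packed rows [%lld, %lld)\n", (long long)i,
                (long long)offset, (long long)(offset + batch_sizes[i]));
    offset += batch_sizes[i];
  }
  return 0;  // offset now equals the total number of packed rows (12)
}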
@ -170,8 +170,7 @@ std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor
    }
    int64_t dec = prev_batch_size - batch_size;
    if (dec > 0) {
-      for (const auto j : c10::irange(dec)) {
-        (void)j; //Suppress unused variable warning
+      for (int64_t j = 0; j < dec; ++j) {
        (*lengths--) = i;
      }
    }
@ -207,7 +206,7 @@ Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value
  out_dims.insert(out_dims.end(), trailing_dims.begin(), trailing_dims.end());

  Tensor out = at::full(out_dims, padding_value, sequences[0].options());
-  for (const auto i : c10::irange(sequences_size)) {
+  for (int64_t i = 0; i < sequences_size; i++) {
    const Tensor currseq = sequences[i];
    const int64_t length_i = currseq.size(0);
    // use index notation to prevent duplicate references to the tensor
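pad_sequence pre-fills an output of shape {num_sequences, max_len, trailing...} with padding_value and then overwrites only each sequence's leading slice. The sketch below mirrors that pattern with plain nested vectors standing in for the tensors; the sequence data is illustrative.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Variable-length 1-D sequences (illustrative data).
  const std::vector<std::vector<int64_t>> sequences = {{1, 2, 3}, {4}, {5, 6}};
  const int64_t sequences_size = (int64_t)sequences.size();
  const double padding_value = 0;

  int64_t max_len = 0;
  for (const auto& s : sequences) max_len = std::max<int64_t>(max_len, (int64_t)s.size());

  // "out" has shape {sequences_size, max_len}, pre-filled with padding_value;
  // each sequence only overwrites its first length_i positions, matching the
  // at::full + per-sequence copy in the hunk above.
  std::vector<std::vector<double>> out(sequences_size,
                                       std::vector<double>(max_len, padding_value));
  for (int64_t i = 0; i < sequences_size; i++) {
    const auto& currseq = sequences[i];
    const int64_t length_i = (int64_t)currseq.size();
    for (int64_t j = 0; j < length_i; ++j) out[i][j] = (double)currseq[j];
  }

  for (const auto& row : out) {
    for (double v : row) std::printf("%g ", v);
    std::printf("\n");  // rows: "1 2 3", "4 0 0", "5 6 0"
  }
  return 0;
}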
@ -2,7 +2,6 @@
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <ATen/native/DispatchStub.h>
-#include <c10/util/irange.h>

#pragma once
@ -213,7 +212,7 @@ pool3d_shape_check(
  TORCH_CHECK(ndim == 4 || ndim == 5,
      fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes());

-  for (const auto i : c10::irange(1, ndim)) {
+  for (int64_t i = 1; i < ndim; ++i) {
    TORCH_CHECK(input.size(i) > 0,
        fn_name, "Expected input to have non-zero size for non-batch dimensions, but got",
        input.sizes(), " with dimension ", i, " being empty.");
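The shape check rejects any non-leading dimension of size zero in a 4-D or 5-D pooling input. A standalone sketch of the same validation; check_pool_input_sizes is a hypothetical helper, and a std::runtime_error stands in for TORCH_CHECK.

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// A 4-D or 5-D pooling input must have every dimension past dim 0 non-empty;
// dim 0 is exempt from the check, as in the loop above.
static void check_pool_input_sizes(const std::vector<int64_t>& sizes) {
  const int64_t ndim = (int64_t)sizes.size();
  if (ndim != 4 && ndim != 5) {
    throw std::runtime_error("Expected 4D or 5D tensor for input");
  }
  for (int64_t i = 1; i < ndim; ++i) {
    if (sizes[i] <= 0) {
      throw std::runtime_error("dimension " + std::to_string(i) + " is empty");
    }
  }
}

int main() {
  check_pool_input_sizes({2, 3, 8, 8, 8});    // fine
  try {
    check_pool_input_sizes({2, 3, 0, 8, 8});  // throws: dimension 2 is empty
  } catch (const std::runtime_error&) {
    return 0;
  }
  return 1;
}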
@ -206,9 +206,9 @@ void CalcColOffsetsTranspose(
    const int8_t* Bint8,
    int32_t B_zero_point,
    int32_t* col_offsets) {
-  for (const auto i : c10::irange(N)) {
+  for (int i = 0; i < N; ++i) {
    int32_t sum = 0;
-    for (const auto j : c10::irange(K)) {
+    for (int j = 0; j < K; ++j) {
      sum += Bint8[i * K + j];
    }
    col_offsets[i] = sum - B_zero_point * K;
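For each row i of the transposed int8 weight matrix, the loop sums the K entries and subtracts B_zero_point * K, producing the per-output-column correction used by asymmetric-quantization GEMM. The sketch below reproduces that arithmetic on a tiny illustrative matrix; the helper name and data are made up for the example.

#include <cstdint>
#include <cstdio>

// col_offsets[i] = sum_j Bint8[i*K + j] - B_zero_point * K, as in the loop above.
static void calc_col_offsets_transpose(int N, int K, const int8_t* Bint8,
                                       int32_t B_zero_point, int32_t* col_offsets) {
  for (int i = 0; i < N; ++i) {
    int32_t sum = 0;
    for (int j = 0; j < K; ++j) {
      sum += Bint8[i * K + j];
    }
    col_offsets[i] = sum - B_zero_point * K;
  }
}

int main() {
  const int N = 2, K = 3;
  const int8_t B[N * K] = {1, 2, 3, -1, -2, -3};
  const int32_t zero_point = 1;
  int32_t col_offsets[N];
  calc_col_offsets_transpose(N, K, B, zero_point, col_offsets);
  std::printf("%d %d\n", col_offsets[0], col_offsets[1]);  // prints: 3 -9
  return 0;
}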
@ -353,7 +353,7 @@ bool CheckAndSaturate(T max_val, T* element) {
void HandleWeightsSaturation(int64_t N, float* weight) {
  const float kFp16Max = RawUint16ToFp16(0x7BFF);
  bool found_out_of_range = false;
-  for (const auto i : c10::irange(N)) {
+  for (int64_t i = 0; i < N; ++i) {
    if (CheckAndSaturate<float>(kFp16Max, weight + i)) {
      found_out_of_range = true;
    }
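HandleWeightsSaturation clamps every weight into the range representable by fp16 before packing, and records whether any value had to be saturated. A standalone sketch follows; check_and_saturate is a stand-in with the assumed clamp-both-signs behavior, and the 65504.0f literal is the value of the largest finite fp16 number (0x7BFF), which the real code derives via RawUint16ToFp16.

#include <cstdint>
#include <cstdio>

// Assumed behavior: clamp |x| into [-max_val, max_val], report if clamping happened.
static bool check_and_saturate(float max_val, float* element) {
  if (*element > max_val) { *element = max_val; return true; }
  if (*element < -max_val) { *element = -max_val; return true; }
  return false;
}

int main() {
  const float kFp16Max = 65504.0f;  // largest finite fp16 value (raw 0x7BFF)
  float weights[] = {1.0f, 1.0e6f, -2.0e6f};
  const int64_t N = 3;

  bool found_out_of_range = false;
  for (int64_t i = 0; i < N; ++i) {
    if (check_and_saturate(kFp16Max, weights + i)) {
      found_out_of_range = true;
    }
  }
  std::printf("saturated=%d w1=%g w2=%g\n", (int)found_out_of_range,
              weights[1], weights[2]);  // saturated=1 w1=65504 w2=-65504
  return 0;
}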
Some files were not shown because too many files have changed in this diff.