use irange for loops (#66234)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66234

Modified loops in files under fbsource/fbcode/caffe2/ from the format `for (TYPE var = x0; var < x_max; var++)` to the format `for (const auto var : irange(xmax))`.

This was achieved by running r-barnes's loop upgrader script (D28874212), with some modifications to exclude all files under /torch/jit, plus a number of reversions and unused-variable warning suppressions added by hand.

bypass_size_limit
allow-large-files

Test Plan: Sandcastle

Reviewed By: ngimel

Differential Revision: D30652629

fbshipit-source-id: 0ae6c4bbbb554bad42e372792a6430e1acf15e3e
Committed by: Facebook GitHub Bot
Parent: b5b7d6a3a6
Commit: 687c2267d4
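The change itself is mechanical: an index-based for loop over [0, n) becomes a range-based loop over c10::irange(n), and loops with a non-zero lower bound use the two-argument form c10::irange(start, end); both forms appear throughout the diff below. A minimal sketch of the pattern, assuming only <c10/util/irange.h>; the function and variable names here are made up for illustration and are not taken from the diff:

    #include <c10/util/irange.h>

    #include <cstdint>
    #include <vector>

    void fill_indices(std::vector<int64_t>& indices, int64_t n) {
      // Before the rewrite this would read:
      //   for (int64_t i = 0; i < n; i++) { indices.push_back(i); }
      // c10::irange(n) yields 0, 1, ..., n-1, and the loop variable is declared
      // const, so it cannot be accidentally mutated inside the body.
      for (const auto i : c10::irange(n)) {
        indices.push_back(i);
      }
      // Loops with a non-zero lower bound use the two-argument form,
      // c10::irange(start, end), which iterates over [start, end).
    }

One practical effect, visible in the hunks below, is that loops previously written with int, int64_t, or size_t counters all end up with the counter type deduced from the loop bound.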
@@ -4,6 +4,7 @@
 #include <string>
 
 #include <c10/core/MemoryFormat.h>
+#include <c10/util/irange.h>
 
 #include <fbjni/ByteBuffer.h>
 #include <fbjni/fbjni.h>
@@ -97,7 +98,7 @@ static at::Tensor newAtTensor(
   std::vector<int64_t> shapeVec{};
   shapeVec.reserve(rank);
   auto numel = 1;
-  for (auto i = 0; i < rank; ++i) {
+  for (const auto i : c10::irange(rank)) {
     shapeVec.push_back(shapeArr[i]);
     numel *= shapeArr[i];
   }
@@ -521,7 +522,7 @@ at::IValue JIValue::JIValueToAtIValue(
 
   std::vector<at::IValue> elements;
   elements.reserve(n);
-  for (auto i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     auto jivalue_element = jarray->getElement(i);
     auto element = JIValue::JIValueToAtIValue(jivalue_element);
     elements.push_back(std::move(element));
@@ -535,7 +536,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<bool> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -547,7 +548,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<int64_t> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -559,7 +560,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArrayPinned.size();
   c10::List<double> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(jArrayPinned[i]);
   }
   return at::IValue{std::move(list)};
@@ -572,7 +573,7 @@ at::IValue JIValue::JIValueToAtIValue(
   size_t n = jArray->size();
   c10::List<at::Tensor> list{};
   list.reserve(n);
-  for (size_t i = 0; i < n; ++i) {
+  for (const auto i : c10::irange(n)) {
     list.push_back(
         TensorHybrid::newAtTensorFromJTensor(jArray->getElement(i)));
   }
@@ -594,7 +595,7 @@ at::IValue JIValue::JIValueToAtIValue(
   c10::impl::GenericList list{c10::unshapedType(first_element.type())};
   list.reserve(n);
   list.push_back(first_element);
-  for (auto i = 1; i < n; ++i) {
+  for (const auto i : c10::irange(1, n)) {
     auto jivalue_element = jarray->getElement(i);
     auto element = JIValue::JIValueToAtIValue(jivalue_element);
     list.push_back(element);
@@ -6,6 +6,7 @@
 #include <fbjni/ByteBuffer.h>
 #include <fbjni/fbjni.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/script.h>
@@ -157,7 +158,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
   std::vector<at::IValue> inputs{};
   size_t n = jinputs->size();
   inputs.reserve(n);
-  for (size_t i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
     if (at::kVulkan == deviceType_) {
       inputs.push_back(
@@ -186,7 +187,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
   std::vector<at::IValue> inputs{};
   size_t n = jinputs->size();
   inputs.reserve(n);
-  for (size_t i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
     if (at::kVulkan == deviceType_) {
       inputs.push_back(
@@ -3,6 +3,7 @@
 #include <ATen/BatchedFallback.h>
 #include <ATen/native/ResizeCommon.h>
 #include <ATen/ATen.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -329,7 +330,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) {
 
   VmapDimVector all_dims_physical;
   all_dims_physical.reserve(self_physical.tensor().dim());
-  for (int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) {
+  for (const auto bdim : c10::irange(self_physical.numBatchDims())) {
     all_dims_physical.push_back(bdim);
   }
   all_dims_physical.insert(
@@ -2,6 +2,7 @@
 
 #include <ATen/Parallel.h>
 #include <ATen/TensorUtils.h>
+#include <c10/util/irange.h>
 #include <limits>
 #include <utility>
 #include <cstring>
@@ -130,7 +131,7 @@ inline Tensor sort_strides(Tensor& tensor_) {
   IntArrayRef strides = tensor_.strides();
   std::vector<int64_t> indices;
   indices.reserve(tensor_.ndimension());
-  for (int64_t i = 0; i < tensor_.ndimension(); i++) {
+  for (const auto i : c10::irange(tensor_.ndimension())) {
     indices.push_back(i);
   }
   std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) {
@@ -196,7 +197,7 @@ inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
   if (tensors.size() == 0)
     return true;
   int64_t all_numel = tensors[0].numel();
-  for (size_t i = 1; i < tensors.size(); i++) {
+  for (const auto i : c10::irange(1, tensors.size())) {
     if (tensors[i].numel() != all_numel)
       return false;
   }
@@ -11,6 +11,7 @@
 #include <c10/util/Exception.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/core/QEngine.h>
+#include <c10/util/irange.h>
 
 #include <memory>
 #include <mutex>
@@ -349,7 +350,7 @@ static inline void manual_seed(uint64_t seed) {
   // available. In that case, we must not seed CUDA; it will fail!
   const auto num_gpus = detail::getCUDAHooks().getNumGPUs();
   if (hasCUDA() && num_gpus > 0) {
-    for (int i = 0; i < num_gpus; i++) {
+    for (const auto i : c10::irange(num_gpus)) {
       auto cuda_gen = globalContext().defaultGenerator(
           Device(at::kCUDA, static_cast<c10::DeviceIndex>(i))
       );
@@ -197,7 +197,7 @@ std::vector<int64_t> infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef t
   // compute output strides which preserves the input tensor's memory layout
   std::vector<int64_t> out_strides(ndim);
   int64_t curr_stride = 1;
-  for (size_t i = 0; i < ndim; ++i) {
+  for (const auto i : c10::irange(ndim)) {
     int64_t idx = perm[i];
     out_strides[idx] = curr_stride;
     // Note: for size 0, we simply treated it as 1, it really doesn't matter here
@@ -4,6 +4,7 @@
 #include <ATen/Tensor.h>
 #include <c10/util/Exception.h>
 #include <c10/util/MaybeOwned.h>
+#include <c10/util/irange.h>
 
 #include <functional>
 #include <sstream>
@@ -266,7 +267,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
   // expands a list of Tensors; ignores undefined (null) tensors
   bool first = true;
   DimVector sizes;
-  for (size_t i = 0; i < to_expand.size(); ++i) {
+  for (const auto i : c10::irange(to_expand.size())) {
     if (!to_expand[i].defined()) {
       continue;
     } else if (first) {
@@ -278,7 +279,7 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
   }
 
   std::vector<Tensor> result(to_expand.size());
-  for (size_t i = 0; i < to_expand.size(); ++i) {
+  for (const auto i : c10::irange(to_expand.size())) {
     if (!to_expand[i].defined()) {
       continue;
     } else if (to_expand[i].sizes().equals(sizes)) {
@@ -299,7 +300,7 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) {
   c10::SmallVector<int64_t, 8> reduce_dims;
   const at::IntArrayRef sizes = tensor.sizes();
   const int64_t leading_dims = sizes.size() - shape.size();
-  for (int64_t i = 0; i < leading_dims; ++i) {
+  for (const auto i : c10::irange(leading_dims)) {
     reduce_dims.push_back(i);
   }
   for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
@@ -320,7 +321,7 @@ static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
   if (ndim > target_dim) {
     return false;
   }
-  for (size_t i = 0; i < ndim; i++) {
+  for (const auto i : c10::irange(ndim)) {
     int64_t size = shape[ndim - i - 1];
     int64_t target = desired[target_dim - i - 1];
     if (size != target && size != 1) {
@@ -1,6 +1,7 @@
 #include <ATen/MemoryOverlap.h>
 #include <ATen/core/TensorBase.h>
 #include <c10/core/Layout.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -17,7 +18,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) {
 
   auto strides = t->strides();
   auto sizes = t->sizes();
-  for (size_t i = 0; i < strides.size(); ++i) {
+  for (const auto i : c10::irange(strides.size())) {
     if (strides[i] == 0 && sizes[i] > 1) {
       return MemOverlap::YES;
     }
@@ -225,7 +225,7 @@ std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor) {
   }
   std::vector<Dimname> outnames;
   auto tensor_names = tensor.names();
-  for (int64_t d = 0; d < tensor.dim(); d++) {
+  for (const auto d : c10::irange(tensor.dim())) {
     if (tensor.sizes()[d] != 1) {
       outnames.push_back(tensor_names[d]);
     }
@@ -242,7 +242,7 @@ std::vector<Dimname> compute_diagonal_outnames(
   }
   std::vector<Dimname> outnames;
   auto tensor_names = tensor.names();
-  for (int64_t d = 0; d < tensor.dim(); d++) {
+  for (const auto d : c10::irange(tensor.dim())) {
     if (d == dim1 || d == dim2) {
       continue;
     }
@@ -6,6 +6,7 @@
 
 #ifndef C10_MOBILE
 #include <c10/core/thread_pool.h>
+#include <c10/util/irange.h>
 #else
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #endif // C10_MOBILE
@@ -87,7 +88,7 @@ TaskThreadPoolBase& _get_intraop_pool() {
 // `fn` will be called with params: (thread_pool_task_id, task_id).
 void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
 #ifndef C10_MOBILE
-  for (size_t i = 1; i < range; ++i) {
+  for (const auto i : c10::irange(1, range)) {
     _get_intraop_pool().run([fn, i]() { fn((int)i, i); });
   }
   // Run the first task on the current thread directly.
@@ -3,6 +3,7 @@
 #include <ATen/Tensor.h>
 #include <c10/core/TensorImpl.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 namespace at {
 struct TORCH_API SparseTensorImpl : public TensorImpl {
@@ -109,7 +110,7 @@ public:
   bool shrinking_dense_dim = false;
   auto sparse_size_original = sizes().slice(0, sparse_dim);
   auto sparse_size_new = size.slice(0, sparse_dim);
-  for (int64_t i = 0; i < sparse_dim; i++) {
+  for (const auto i : c10::irange(sparse_dim)) {
     if (sparse_size_new[i] < sparse_size_original[i]) {
       shrinking_sparse_dims = true;
       break;
@@ -117,7 +118,7 @@ public:
   }
   auto dense_size_original = sizes().slice(sparse_dim);
   auto dense_size_new = size.slice(sparse_dim);
-  for (int64_t i = 0; i < dense_dim; i++) {
+  for (const auto i : c10::irange(dense_dim)) {
     if (dense_size_new[i] < dense_size_original[i]) {
       shrinking_dense_dim = true;
       break;
@@ -3,6 +3,7 @@
 #include <ATen/ATen.h>
 #include <ATen/SparseTensorImpl.h>
 #include <ATen/Parallel.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace sparse {
 
@@ -98,7 +99,7 @@ Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) {
   at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     int64_t h, hp0, hp1;
-    for (auto i = start; i < end; i++) {
+    for (const auto i : c10::irange(start, end)) {
      hp0 = indices[i];
      hp1 = (i+1 == nnz) ? dim : indices[i+1];
      if (hp0 != hp1) {
@@ -1,6 +1,7 @@
 #include <ATen/TensorIndexing.h>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 namespace at {
 namespace indexing {
@@ -31,7 +32,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 
 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
   stream << "(";
-  for (size_t i = 0; i < tensor_indices.size(); i++) {
+  for (const auto i : c10::irange(tensor_indices.size())) {
     stream << tensor_indices[i];
     if (i < tensor_indices.size() - 1) stream << ", ";
   }
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/util/Optional.h>
+#include <c10/util/irange.h>
 #include <ATen/core/TensorBody.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/Functions.h>
@@ -335,7 +336,7 @@ static inline Tensor scalarToTensor(const Scalar& v, const TensorOptions& option
 // strip away unit dimensions from the left of 'src'
 static inline IntArrayRef slicePrefix1sSize(const IntArrayRef& sizes) {
   size_t first_non1_src = sizes.size();
-  for (size_t i = 0; i < sizes.size(); ++i) {
+  for (const auto i : c10::irange(sizes.size())) {
     if (sizes[i] != 1) {
       first_non1_src = i;
       break;
@@ -439,7 +440,7 @@ static inline Tensor applySlicing(
       "too many indices for tensor of dimension ", (int)self_sizes.size());
 
   Tensor result = self;
-  for (size_t i = 0; i < indices.size(); i++) {
+  for (const auto i : c10::irange(indices.size())) {
     auto& obj = indices[i];
     result = handleDimInMultiDimIndexing(
         /*prev_dim_result=*/result,
@@ -36,8 +36,8 @@ inline void get_base_ptrs(char** ptrs, ArrayRef<OperandInfo> operands) {
 }
 
 inline void get_strides(int64_t* strides, ArrayRef<OperandInfo> operands, int64_t ndim) {
-  for (int64_t dim = 0; dim < ndim; ++dim) {
-    for (size_t arg = 0; arg < operands.size(); ++arg) {
+  for (const auto dim : c10::irange(ndim)) {
+    for (const auto arg : c10::irange(operands.size())) {
       *strides++ = operands[arg].stride_bytes[dim];
     }
   }
@@ -214,7 +214,7 @@ void TensorIteratorBase::reorder_dimensions() {
   // returns 1 if the dim0 should come after dim1, -1 if dim0 should come
   // before dim1, and 0 if the comparison is ambiguous.
   auto should_swap = [&](size_t dim0, size_t dim1) {
-    for (int arg = 0; arg < ntensors(); arg++) {
+    for (const auto arg : c10::irange(ntensors())) {
       // ignore undefined or incorrectly sized tensors
       if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) {
        continue;
@@ -251,7 +251,7 @@ void TensorIteratorBase::reorder_dimensions() {
   };
 
   // insertion sort with support for ambiguous comparisons
-  for (int i = 1; i < ndim(); i++) {
+  for (const auto i : c10::irange(1, ndim())) {
     int dim1 = i;
     for (int dim0 = i - 1; dim0 >= 0; dim0--) {
       int comparison = should_swap(perm_[dim0], perm_[dim1]);
@@ -497,7 +497,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
 StrideVector TensorIteratorBase::compatible_stride(int element_size) const {
   auto stride = StrideVector();
   int64_t next_stride = element_size;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     stride.push_back(next_stride);
     next_stride *= shape_[dim];
   }
@@ -510,14 +510,14 @@ DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const {
   TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_);
   TORCH_INTERNAL_ASSERT(input.size()==perm_.size());
   auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to.
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     res[perm_[dim]] = input[dim];
   }
   return res;
 }
 
 void TensorIteratorBase::allocate_or_resize_outputs() {
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     auto& op = operands_[i];
     if (!op.tensor_base().defined() || op.will_resize) {
       TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -525,7 +525,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
       op.stride_bytes = compatible_stride(element_size);
       // check if permutation is just an inverted order
      bool inverted = true;
-      for (int i = 0; i < ndim(); i++) {
+      for (const auto i : c10::irange(ndim())) {
        if (perm_[i] != ndim() - i - 1) {
          inverted = false;
          break;
@@ -539,7 +539,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() {
      set_output(i, tensor_shape, {}, original_options(op), names_);
     } else {
      auto tensor_stride = invert_perm(op.stride_bytes);
-      for (int dim = 0; dim < ndim(); dim++) {
+      for (const auto dim : c10::irange(ndim())) {
        tensor_stride[dim] /= element_size;
      }
      set_output(i, tensor_shape, tensor_stride, original_options(op), names_);
@@ -593,7 +593,7 @@ void TensorIteratorBase::coalesce_dimensions() {
     if (shape0 == 1 || shape1 == 1) {
      return true;
     }
-    for (int i = 0; i < ntensors(); i++) {
+    for (const auto i : c10::irange(ntensors())) {
      auto& stride = operands_[i].stride_bytes;
      if (shape0 * stride[dim0] != stride[dim1]) {
        return false;
@@ -604,14 +604,14 @@ void TensorIteratorBase::coalesce_dimensions() {
 
   // replace each operands stride at dim0 with its stride at dim1
   auto replace_stride = [&](int dim0, int dim1) {
-    for (int i = 0; i < ntensors(); i++) {
+    for (const auto i : c10::irange(ntensors())) {
      auto& stride = operands_[i].stride_bytes;
      stride[dim0] = stride[dim1];
     }
   };
 
   int prev_dim = 0;
-  for (int dim = 1; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(1, ndim())) {
     if (can_coalesce(prev_dim, dim)) {
      if (shape_[prev_dim] == 1) {
        replace_stride(prev_dim, dim);
@@ -627,7 +627,7 @@ void TensorIteratorBase::coalesce_dimensions() {
   }
 
   shape_.resize(prev_dim + 1);
-  for (int i = 0; i < ntensors(); i++) {
+  for (const auto i : c10::irange(ntensors())) {
     operands_[i].stride_bytes.resize(ndim());
   }
   has_coalesced_dimensions_ = true;
@@ -670,7 +670,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {
 
   auto reorder = [perm](IntArrayRef data) {
     auto res = DimVector(data.size(), 0);
-    for (size_t i = 0; i < perm.size(); i++) {
+    for (const auto i : c10::irange(perm.size())) {
      res[i] = data[perm[i]];
     }
     return res;
@@ -687,7 +687,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {
 
 int64_t TensorIteratorBase::num_output_elements() const {
   int64_t elem = 1;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) {
      elem *= shape_[dim];
     }
@@ -697,7 +697,7 @@ int64_t TensorIteratorBase::num_output_elements() const {
 
 int TensorIteratorBase::num_reduce_dims() const {
   int count = 0;
-  for (int dim = 0; dim < ndim(); dim++) {
+  for (const auto dim : c10::irange(ndim())) {
     if (operands_[0].stride_bytes[dim] == 0) {
      count++;
     }
@@ -760,7 +760,7 @@ bool TensorIteratorBase::is_contiguous() const {
 
 bool TensorIteratorBase::is_scalar(int arg) const {
   const auto& stride = operands_[arg].stride_bytes;
-  for (int i = 0; i < ndim(); i++) {
+  for (const auto i : c10::irange(ndim())) {
     if (stride[i] != 0 && shape_[i] != 1) {
      return false;
     }
@@ -815,7 +815,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) {
 
 void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) {
   TORCH_INTERNAL_ASSERT(start_dim <= ndim());
-  for (int i = start_dim; i < ndim(); ++i) {
+  for (const auto i : c10::irange(start_dim, ndim())) {
     for (auto& op : operands_) {
      op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim];
     }
@@ -1063,13 +1063,13 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
 
 void TensorIteratorBase::mark_outputs() {
   // TODO: merge this into populate_operands
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     operands_[i].is_output = true;
     const auto& output = tensor(i);
     if (!output.defined()) continue;
 
     // check if output is also an input
-    for (int arg = num_outputs_; arg < ntensors(); arg++) {
+    for (const auto arg : c10::irange(num_outputs_, ntensors())) {
      const auto& input = tensor(arg);
      if (output.is_same(input)) {
        operands_[i].is_read_write = true;
@@ -1086,7 +1086,7 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config)
   if (config.static_shape_.has_value()) {
     return;
   }
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     const auto& output = tensor(i);
     if (output.defined() && !output.sizes().equals(shape_)) {
      if (config.resize_outputs_ && !operands_[i].is_read_write) {
@@ -1104,11 +1104,11 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config
   if (!config.check_mem_overlap_) {
     return;
   }
-  for (int i = 0; i < num_outputs_; i++) {
+  for (const auto i : c10::irange(num_outputs_)) {
     const auto& output = tensor_base(i);
     if (!output.defined()) continue;
     assert_no_internal_overlap(output);
-    for (int j = num_outputs_; j < ntensors(); j++) {
+    for (const auto j : c10::irange(num_outputs_, ntensors())) {
      const auto& input = tensor_base(j);
      if (!input.is_same(output)) {
        assert_no_partial_overlap(output, input);
@@ -1164,7 +1164,7 @@ void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) {
      op.stride_bytes.resize(ndim(), 0);
     else
      op.stride_bytes.resize(ndim());
-    for (size_t i = 0; i < original_shape.size(); i++) {
+    for (const auto i : c10::irange(original_shape.size())) {
      // see NOTE: [Computing output strides]
      if (original_shape[i] == 1 && shape_[offset + i] !=1) {
        op.stride_bytes[offset + i] = 0;
@@ -1183,7 +1183,7 @@ bool TensorIteratorBase::can_use_32bit_indexing() const {
   }
   for (auto& op : operands_) {
     int64_t max_offset = 1;
-    for (int dim = 0; dim < ndim(); dim++) {
+    for (const auto dim : c10::irange(ndim())) {
      max_offset += (shape_[dim] - 1) * op.stride_bytes[dim];
     }
     if (max_offset > max_value) {
@@ -1245,7 +1245,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
   switch (setup_type) {
     case FastSetupType::CONTIGUOUS:
      {
-        for (int i = 0; i < num_outputs_; i++){
+        for (const auto i : c10::irange(num_outputs_)) {
          auto& op = operands_[i];
          if (!op.tensor_base().defined()) {
            TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1256,7 +1256,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
      }
     case FastSetupType::CHANNELS_LAST:
      {
-        for (int i = 0; i < num_outputs_; i++){
+        for (const auto i : c10::irange(num_outputs_)) {
          auto& op = operands_[i];
          if (!op.tensor_base().defined()) {
            TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -1273,7 +1273,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) {
      if (tensor(i_defined).defined()) break;
     }
     TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs");
-    for (int i = 0; i < num_outputs_; i++){
+    for (const auto i : c10::irange(num_outputs_)) {
      auto& op = operands_[i];
      if (!op.tensor_base().defined()) {
        TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i);
@@ -4,6 +4,7 @@
 #include <c10/util/MaybeOwned.h>
 #include <c10/util/SmallVector.h>
 #include <c10/util/TypeCast.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Dimname.h>
 #include <ATen/core/Range.h>
 #include <ATen/core/TensorBase.h>
@@ -322,9 +323,9 @@ private:
      char** base, const int64_t* strides, int64_t size0, int64_t size1) {
     PtrVector data(base, base + ntensor);
     const int64_t* outer_strides = &strides[ntensor];
-    for (int64_t i = 0; i < size1; i++) {
+    for (const auto i : c10::irange(size1)) {
      if (i > 0) {
-        for (int64_t arg = 0; arg < ntensor; arg++) {
+        for (const auto arg : c10::irange(ntensor)) {
          data[arg] += outer_strides[arg];
        }
      }
@@ -397,7 +398,7 @@ public:
 
   bool has_contiguous_first_dim() const {
     int num_tensors = ntensors();
-    for (int i = 0; i < num_tensors; i++) {
+    for (const auto i : c10::irange(num_tensors)) {
      if (strides(i)[0] != element_size(i)) {
        return false;
      }
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <ATen/native/TensorIterator.h>
|
#include <ATen/native/TensorIterator.h>
|
||||||
#include <c10/util/SmallBuffer.h>
|
#include <c10/util/SmallBuffer.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
|
|
||||||
@ -24,9 +25,9 @@ inline void get_data_ptrs(
|
|||||||
const int64_t ntensors = base.size();
|
const int64_t ntensors = base.size();
|
||||||
const int64_t ndim = counter.size();
|
const int64_t ndim = counter.size();
|
||||||
std::copy(base.begin(), base.end(), ptrs);
|
std::copy(base.begin(), base.end(), ptrs);
|
||||||
for (int64_t dim = 0; dim < ndim; ++dim) {
|
for (const auto dim : c10::irange(ndim)) {
|
||||||
int64_t value = counter[dim];
|
int64_t value = counter[dim];
|
||||||
for (int64_t arg = 0; arg < ntensors; ++arg) {
|
for (const auto arg : c10::irange(ntensors)) {
|
||||||
ptrs[arg] += value * strides[dim * ntensors + arg];
|
ptrs[arg] += value * strides[dim * ntensors + arg];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -56,7 +56,7 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
   start = maybe_wrap_dim(start, names.size());
   end = maybe_wrap_dim(end, names.size());
   names_.reserve(end - start);
-  for (int64_t idx = start; idx < end; ++idx) {
+  for (const auto idx : c10::irange(start, end)) {
     names_.emplace_back(names, idx);
   }
 }
@@ -2,6 +2,7 @@
 #include <ATen/Config.h>
 #include <ATen/TensorUtils.h>
 #include <c10/util/accumulate.h>
+#include <c10/util/irange.h>
 
 #include <ostream>
 #include <sstream>
@@ -323,7 +324,7 @@ size_t computeStorageNbytes(
   // size of the underlying storage is 1 bigger than the offset
   // of the last element according to stride
   size_t size = 1;
-  for(size_t i = 0; i < sizes.size(); i++) {
+  for (const auto i : c10::irange(sizes.size())) {
     if(sizes[i] == 0) {
      return 0;
     }
@@ -83,7 +83,7 @@ VmapDimVector VmapPhysicalView::getPhysicalShape(IntArrayRef logical_shape) cons
 static BatchDims computeFrontBatchDimsFromLevels(std::bitset<kVmapNumLevels> levels_bitset) {
   BatchDims bdims;
   int64_t dim = 0;
-  for (int64_t level = 0; level < kVmapNumLevels; level++) {
+  for (const auto level : c10::irange(kVmapNumLevels)) {
     if (!levels_bitset[level]) {
      continue;
     }
@@ -208,7 +208,7 @@ MultiBatchVmapTransform::logicalToPhysical(TensorList logical_tensors) {
   VmapDimVector batch_sizes(num_batch_dims, 1);
   for (const auto& physical_tensor : physical_tensors) {
     auto physical_sizes = physical_tensor.sizes();
-    for (int64_t dim = 0; dim < num_batch_dims; dim++) {
+    for (const auto dim : c10::irange(num_batch_dims)) {
      if (physical_sizes[dim] != 1) {
        batch_sizes[dim] = physical_sizes[dim];
      }
@@ -2,6 +2,7 @@
 
 #include <c10/core/WrapDimMinimal.h>
 #include <c10/core/TensorImpl.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Tensor.h>
 
 namespace at {
@@ -40,7 +41,7 @@ static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_p
   }
   int64_t min = -dim_post_expr;
   int64_t max = dim_post_expr - 1;
-  for (int64_t i = 0; i < ndims; ++i) {
+  for (const auto i : c10::irange(ndims)) {
     auto &dim = dims[i];
     if (dim < min || dim > max) {
      TORCH_CHECK_INDEX(false,
@@ -85,7 +86,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) {
 
 // wrap negative dims in a vector
 static inline void wrap_all_dims(std::vector<int64_t>& dims_to_wrap, int64_t tensor_total_dims) {
-  for (size_t i = 0; i < dims_to_wrap.size(); i++) {
+  for (const auto i : c10::irange(dims_to_wrap.size())) {
     dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims);
   }
 }
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/core/TensorImpl.h>
+#include <c10/util/irange.h>
 #include <ATen/WrapDimUtils.h>
 #include <sstream>
 #include <bitset>
@@ -15,7 +16,7 @@ constexpr size_t dim_bitset_size = 64;
 static inline std::bitset<dim_bitset_size> dim_list_to_bitset(IntArrayRef dims, int64_t ndims) {
   TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported");
   std::bitset<dim_bitset_size> seen;
-  for (size_t i = 0; i < dims.size(); i++) {
+  for (const auto i : c10::irange(dims.size())) {
     size_t dim = maybe_wrap_dim(dims[i], ndims);
     TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims");
     seen[dim] = true;
@@ -1,4 +1,5 @@
 #include <benchmark/benchmark.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/passes/xnnpack_rewrite.h>
 #include <torch/csrc/autograd/generated/variable_factories.h>
 #include <torch/csrc/jit/api/module.h>
@@ -33,7 +34,7 @@ static void stateful_conv1d(benchmark::State& state) {
   )");
 
   std::vector<std::vector<torch::jit::IValue>> inputs;
-  for (int i = 0; i < 10; ++i) {
+  for (const auto i : c10::irange(10)) {
     std::vector<torch::jit::IValue> input;
     // NOLINTNEXTLINE(modernize-use-emplace)
     input.push_back(torch::rand({batch_size, input_channels, width}));
@@ -70,8 +71,8 @@ static void GenerateSizes(benchmark::internal::Benchmark* b) {
 
   for (size_t input_channels = 32; input_channels < 256; input_channels *= 2) {
     for (size_t output_channels = 32; output_channels < 256; output_channels *= 2) {
-      for (size_t kernel = 3; kernel < 8; ++kernel) {
-        for (size_t batch_size = 1; batch_size < 5; ++batch_size) {
+      for (const auto kernel : c10::irange(3, 8)) {
+        for (const auto batch_size : c10::irange(1, 5)) {
          for (size_t width = 32; width < 256; width *= 2) {
            b->Args({input_channels, output_channels, kernel, batch_size, width, true});
            b->Args({input_channels, output_channels, kernel, batch_size, width, false});
@@ -4,6 +4,7 @@
 // device code.
 
 #include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace detail {
 
@@ -1,4 +1,5 @@
 #include <ATen/core/Formatting.h>
+#include <c10/util/irange.h>
 
 #include <cmath>
 #include <cstdint>
@@ -44,7 +45,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
   }
   bool intMode = true;
   auto self_p = self.data_ptr<double>();
-  for(int64_t i = 0; i < size; i++) {
+  for (const auto i : c10::irange(size)) {
     auto z = self_p[i];
     if(std::isfinite(z)) {
      if(z != std::ceil(z)) {
@@ -70,7 +71,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
   } else {
     expMin = fabs(self_p[offset]);
     expMax = fabs(self_p[offset]);
-    for(int64_t i = offset; i < size; i++) {
+    for (const auto i : c10::irange(offset, size)) {
      double z = fabs(self_p[i]);
      if(std::isfinite(z)) {
        if(z < expMin) {
@@ -130,7 +131,8 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Ten
 
 static void __printIndent(std::ostream &stream, int64_t indent)
 {
-  for(int64_t i = 0; i < indent; i++) {
+  for (const auto i : c10::irange(indent)) {
+    (void)i; //Suppress unused variable warning
     stream << " ";
   }
 }
@@ -168,7 +170,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
     printScale(stream,scale);
     __printIndent(stream, indent);
   }
-  for(int64_t l = 0; l < self.size(0); l++) {
+  for (const auto l : c10::irange(self.size(0))) {
     Tensor row = self.select(0,l);
     double *row_ptr = row.data_ptr<double>();
     for(int64_t c = firstColumn; c < lastColumn+1; c++) {
@@ -198,8 +200,7 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize)
   bool start = true;
   bool finished = false;
   counter[0] = -1;
-  for(size_t i = 1; i < counter.size(); i++)
-    counter[i] = 0;
+  for (const auto i : c10::irange(1, counter.size()))counter[i] = 0;
   while(true) {
     for(int64_t i = 0; self.ndimension()-2; i++) {
      counter[i] = counter[i] + 1;
@@ -269,7 +270,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
      printScale(stream, scale);
     }
     double* tensor_p = tensor.data_ptr<double>();
-    for (int64_t i = 0; i < tensor.size(0); i++) {
+    for (const auto i : c10::irange(tensor.size(0))) {
      stream << std::setw(sz) << tensor_p[i]/scale << std::endl;
     }
   }
@@ -284,7 +285,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
     __printTensor(stream, tensor, linesize);
   }
   stream << "[ " << tensor_.toString() << "{" << tensor.size(0);
-  for(int64_t i = 1; i < tensor.ndimension(); i++) {
+  for (const auto i : c10::irange(1, tensor.ndimension())) {
     stream << "," << tensor.size(i);
   }
   stream << "}";
@@ -155,7 +155,7 @@ private:
   data_.seed_ = seed;
   data_.seeded_ = true;
   data_.state_[0] = seed & 0xffffffff;
-  for(int j = 1; j < MERSENNE_STATE_N; j++) {
+  for (const auto j : c10::irange(1, MERSENNE_STATE_N)) {
     data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
   }
   data_.left_ = 1;
@@ -3,6 +3,7 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/Deprecated.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <stdint.h>
 #include <cstddef>
 
@@ -134,7 +135,7 @@ public:
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : data_(data_) {
-    for (int i = 0; i < N; i++) {
+    for (const auto i : c10::irange(N)) {
      this->sizes_[i] = sizes_[i];
      this->strides_[i] = strides_[i];
     }
@@ -7,6 +7,7 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/ivalue.h>
 #include <c10/core/CPUAllocator.h>
+#include <c10/util/irange.h>

 template<class... Inputs>
 inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
@@ -87,7 +88,7 @@ inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
 template<class T, size_t N>
 void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
@@ -95,7 +96,7 @@ void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
@@ -103,7 +104,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual.get(i));
 }
 }
@@ -111,7 +112,7 @@ void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
 template<class T>
 void expectListEquals(c10::ArrayRef<T> expected, std::vector<T> actual) {
 EXPECT_EQ(expected.size(), actual.size());
-for (size_t i = 0; i < expected.size(); ++i) {
+for (const auto i : c10::irange(expected.size())) {
 EXPECT_EQ(expected[i], actual[i]);
 }
 }
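Note (illustrative, not part of this patch): in the test helpers above the bound is expected.size(), a size_t; with c10::irange(expected.size()) the index is deduced as size_t, so there is no signed/unsigned comparison to warn about and no separate index declaration. A hedged sketch of the same shape outside of gtest:

#include <c10/util/irange.h>
#include <cassert>
#include <vector>

// Compare two sequences element by element, mirroring expectListEquals.
// Plain asserts stand in for EXPECT_EQ; this is illustrative only.
template <typename T>
void check_equal(const std::vector<T>& expected, const std::vector<T>& actual) {
  assert(expected.size() == actual.size());
  for (const auto i : c10::irange(expected.size())) {  // i is a size_t
    assert(expected[i] == actual[i]);
  }
}
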
@@ -5,6 +5,7 @@
 #include <ATen/core/jit_type.h>
 #include <c10/util/Bitset.h>
 #include <c10/core/DispatchKeySet.h>
+#include <c10/util/irange.h>
 #include <ATen/core/Variadic.h>
 #include <ATen/core/stack.h>

@@ -171,7 +172,7 @@ private:
 "The function schema has ", schema.arguments().size(),
 " arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS());
 c10::utils::bitset dispatch_arg_indices_reverse;
-for (size_t index = 0; index < schema.arguments().size(); ++index) {
+for (const auto index : c10::irange(schema.arguments().size())) {
 if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) ||
 schema.arguments()[index].type()->isSubtypeOf(
 *ListType::ofTensors()) ||

@@ -5,6 +5,7 @@
 #include <ATen/Functions.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/op_registration/op_registration.h>
+#include <c10/util/irange.h>
 #include <torch/library.h>

 using namespace at;
@@ -51,7 +52,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

 // Unwrap all arguments
 auto args = torch::jit::pop(*stack, num_arguments);
-for (size_t i = 0; i < num_arguments; i++) {
+for (const auto i : c10::irange(num_arguments)) {
 // TODO: Handle tensor list
 if (args[i].isTensor()) {
 auto* impl = args[i].unsafeToTensorImpl();
@@ -70,7 +71,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack*

 // Rewrap outputs
 auto rets = torch::jit::pop(*stack, num_returns);
-for (size_t i = 0; i < num_returns; i++) {
+for (const auto i : c10::irange(num_returns)) {
 // TODO: Handle tensor list
 if (rets[i].isTensor()) {
 torch::jit::push(*stack, at::detail::make_tensor<GenericWrapperTensorImpl>(std::move(rets[i]).toTensor())); // yes move!

@@ -2,6 +2,7 @@

 #include <c10/util/StringUtil.h>
 #include <c10/util/string_view.h>
+#include <c10/util/irange.h>
 #include <ATen/core/jit_type.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/core/ivalue.h>
@@ -16,7 +16,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
 out << "(";

 bool seen_kwarg_only = false;
-for(size_t i = 0; i < schema.arguments().size(); ++i) {
+for (const auto i : c10::irange(schema.arguments().size())) {
 if (i > 0) out << ", ";
 if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) {
 out << "*, ";
@@ -35,7 +35,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

 const auto& returns = schema.returns();
 out << "(";
-for(size_t i = 0; i < returns.size(); ++i) {
+for (const auto i : c10::irange(returns.size())) {
 if (i > 0) {
 out << ", ";
 }
@@ -53,7 +53,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)

 inline size_t findFirstOutArg(const std::vector<Argument>& args) {
 // find the start of out args in the schema
-for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
+for (const auto out_start_idx : c10::irange(args.size())) {
 if (args.at(out_start_idx).is_out()) {
 return out_start_idx;
 }
@@ -122,7 +122,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 && arguments().size() >= old.arguments().size())) {
 return false;
 }
-for (size_t i = 0; i < returns().size(); ++i) {
+for (const auto i : c10::irange(returns().size())) {
 // Backwards compatibility requires covariance on argument types
 // (i.e. more generic), and contravariance on return types (i.e.
 // more specific).
@@ -138,7 +138,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 size_t new_out_start_idx = findFirstOutArg(arguments());

 // make sure among the default args, they are backward compatible
-for (size_t i = 0; i < old_out_start_idx; i++) {
+for (const auto i : c10::irange(old_out_start_idx)) {
 if (!arguments().at(i).isBackwardCompatibleWith(
 old.arguments().at(i), why_not)) {
 return false;
@@ -146,7 +146,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 }

 // // Validate that all new arguments provided has a default value
-for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
+for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) {
 if (!arguments().at(i).default_value()) {
 if (why_not) {
 *why_not
@@ -160,7 +160,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
 }

 // now compare the out args
-for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
+for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) {
 if (!arguments()
 .at(i - old_out_start_idx + new_out_start_idx)
 .isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
@@ -238,7 +238,7 @@ inline void FunctionSchema::checkAndNormalizeInputs(
 *this);

 size_t consumed_kwargs = 0;
-for (size_t pos = 0; pos < arguments().size(); ++pos) {
+for (const auto pos : c10::irange(arguments().size())) {
 const auto& argument = arguments()[pos];
 if (pos < inputs.size()) {
 checkArg(inputs[pos], argument, pos);
@@ -298,7 +298,7 @@ inline bool isSubtypeOfList(
 if (child.size() != parent.size()) {
 return false;
 }
-for (size_t i = 0; i < child.size(); ++i) {
+for (const auto i : c10::irange(child.size())) {
 const Argument& c = child[i];
 const Argument& p = parent[i];
 if (c.name() != p.name()) {
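Note (illustrative, not part of this patch): one behavioral point about these conversions is that the range-for variable is const and the bound passed to c10::irange is evaluated once, up front. Loops that mutate their index or re-check a changing bound each iteration therefore have to stay in their original form. A small hypothetical sketch of the distinction:

#include <c10/util/irange.h>
#include <vector>

int count_positives(const std::vector<int>& v) {
  int count = 0;
  for (const auto i : c10::irange(v.size())) {
    // Fine: i is only read. Assigning to i would not compile, since it is const.
    if (v[i] > 0) {
      ++count;
    }
  }
  // A loop like `for (size_t i = 0; i < v.size(); ) { ...; i += step; }`
  // cannot be converted mechanically and keeps its classic form.
  return count;
}
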
@@ -22,6 +22,7 @@
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/irange.h>
 #include <c10/util/hash.h>
+#include <c10/util/irange.h>

 namespace torch {
 namespace jit {
@@ -1114,7 +1115,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 }
 std::ostringstream oss;
 oss << devices[0];
-for (size_t idx = 1; idx < devices.size(); idx++) {
+for (const auto idx : c10::irange(1, devices.size())) {
 if (idx == devices.size() - 1) {
 oss << " and ";
 } else {
@@ -1131,7 +1132,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 return c10::kCPU;
 }
 c10::DeviceType deviceType = devices[0].type();
-for (size_t idx = 1; idx < devices.size(); idx++) {
+for (const auto idx : c10::irange(1, devices.size())) {
 TORCH_CHECK_VALUE(
 devices[idx].type() == deviceType,
 "Expected all devices to be of the same type, but got a mismatch between ",
@@ -1151,7 +1152,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
 [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
 // Deduplicate by compacting.
 size_t targetIdx = 0;
-for (size_t sourceIdx = 0; sourceIdx < devices.size(); sourceIdx++) {
+for (const auto sourceIdx : c10::irange(devices.size())) {
 TORCH_CHECK_VALUE(
 devices[sourceIdx].has_index(),
 "Expected devices to have indices, got ", devices[sourceIdx]);
@@ -1,4 +1,5 @@
 #include <ATen/core/op_registration/infer_schema.h>
+#include <c10/util/irange.h>
 #include <sstream>

 namespace c10 {
@@ -20,7 +21,7 @@ std::string fastToString(size_t x) {
 std::vector<Argument> createArgumentVector(c10::ArrayRef<ArgumentDef> args) {
 std::vector<Argument> result;
 result.reserve(args.size());
-for (size_t i = 0; i < args.size(); ++i) {
+for (const auto i : c10::irange(args.size())) {
 // Arguments are named "_<index>"
 result.emplace_back(fastToString(i), (*args[i].getTypeFn)());
 }
@@ -49,7 +50,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
 " vs " + guts::to_string(rhs.returns().size());
 }

-for (size_t i = 0; i < lhs.arguments().size(); ++i) {
+for (const auto i : c10::irange(lhs.arguments().size())) {
 const TypePtr& leftType = lhs.arguments()[i].type();
 const TypePtr& rightType = rhs.arguments()[i].type();
 // Type::operator== is virtual. Comparing pointers first is
@@ -61,7 +62,7 @@ C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema
 }
 }

-for (size_t i = 0; i < lhs.returns().size(); ++i) {
+for (const auto i : c10::irange(lhs.returns().size())) {
 const TypePtr& leftType = lhs.returns()[i].type();
 const TypePtr& rightType = rhs.returns()[i].type();
 // See above about comparing pointers first.

@@ -3,6 +3,7 @@
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
+#include <c10/util/irange.h>
 #include <string>

 namespace c10 {
@@ -69,7 +70,7 @@ struct QualifiedName {
 // Can't be a prefix if it's bigger
 return false;
 }
-for (size_t i = 0; i < thisAtoms.size(); i++) {
+for (const auto i : c10::irange(thisAtoms.size())) {
 if (thisAtoms[i] != otherAtoms[i]) {
 return false;
 }
@@ -116,7 +117,7 @@ struct QualifiedName {
 reserve += e.size() + 1;
 }
 out.reserve(reserve);
-for (size_t i = 0; i < v.size(); ++i) {
+for (const auto i : c10::irange(v.size())) {
 if (i != 0) {
 out.push_back(delimiter);
 }

@@ -4,6 +4,7 @@

 #include <ATen/core/ivalue.h>
 #include <c10/util/Deprecated.h>
+#include <c10/util/irange.h>

 // TODO move this to c10 namespace

@@ -108,7 +109,7 @@ static inline IValue pop(Stack* stack) {
 static inline std::vector<IValue> pop(Stack& stack, size_t n) {
 std::vector<IValue> result;
 result.reserve(n);
-for (size_t i = 0; i < n; ++i) {
+for (const auto i : c10::irange(n)) {
 result.push_back(std::move(peek(stack, i, n)));
 }
 drop(stack, n);
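Note (illustrative, not part of this patch): for readers without the PyTorch tree handy, the idea behind c10::irange can be reproduced in a few lines of standard C++. The sketch below is a simplified, hypothetical stand-in, not the real implementation in c10/util/irange.h; it only shows why a range of integers can drive a range-based for loop:

#include <cstdint>
#include <iostream>

// Minimal integer range: holds begin/end and exposes iterators that
// dereference to the current integer. Not the actual c10 implementation.
struct int_range {
  struct iterator {
    int64_t value;
    int64_t operator*() const { return value; }
    iterator& operator++() { ++value; return *this; }
    bool operator!=(const iterator& other) const { return value != other.value; }
  };
  int64_t begin_, end_;
  iterator begin() const { return {begin_}; }
  iterator end() const { return {end_}; }
};

int_range irange(int64_t end) { return {0, end}; }
int_range irange(int64_t begin, int64_t end) { return {begin, end}; }

int main() {
  for (const auto i : irange(3)) {
    std::cout << i << ' ';  // prints: 0 1 2
  }
  std::cout << '\n';
}
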
@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <ATen/cpu/vec/vec.h>
+#include <c10/util/irange.h>

 namespace at { namespace vec {

@@ -16,7 +17,7 @@ inline scalar_t vec_reduce_all(
 using Vec = vec::Vectorized<scalar_t>;
 scalar_t acc_arr[Vec::size()];
 acc_vec.store(acc_arr);
-for (int64_t i = 1; i < size; i++) {
+for (const auto i : c10::irange(1, size)) {
 std::array<scalar_t, Vec::size()> acc_arr_next = {0};
 acc_arr_next[0] = acc_arr[i];
 Vec acc_vec_next = Vec::loadu(acc_arr_next.data());

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>

@@ -109,7 +110,7 @@ public:
 Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
 __at_align__ c10::complex<double> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -293,7 +294,7 @@ public:
 __at_align__ c10::complex<double> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
@@ -144,7 +145,7 @@ public:
 Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
 __at_align__ c10::complex<float> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -327,7 +328,7 @@ public:
 __at_align__ c10::complex<float> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 #include <sleef.h>
 #endif
@@ -72,7 +73,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -103,7 +104,7 @@ public:
 Vectorized<double> map(double (*const f)(double)) const {
 __at_align__ double tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -180,7 +181,7 @@ public:
 __at_align__ double tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -190,7 +191,7 @@ public:
 __at_align__ double tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);

@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 #include <sleef.h>
 #endif
@@ -80,7 +81,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -109,7 +110,7 @@ public:
 Vectorized<float> map(float (*const f)(float)) const {
 __at_align__ float tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -217,7 +218,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -227,7 +228,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -5,6 +5,7 @@

 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@@ -221,7 +222,7 @@ public:
 }
 else {
 __at_align__ float tmp_values[size()];
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0.0;
 }
 std::memcpy(
@@ -287,7 +288,7 @@ public:
 __at_align__ float tmp[size()];
 __at_align__ float res[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 if (_isnan(tmp[i])) {
 std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
 } else {
@@ -299,7 +300,7 @@ public:
 Vectorized<float> map(float (*const f)(float)) const {
 __at_align__ float tmp[size()];
 store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -336,7 +337,7 @@ public:
 __at_align__ float tmp_exp[size()];
 store(tmp);
 exp.store(tmp_exp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
 }
 return loadu(tmp);
@@ -371,7 +372,7 @@ public:
 __at_align__ float tmp_q[size()];
 store(tmp);
 q.store(tmp_q);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::fmod(tmp[i], tmp_q[i]);
 }
 return loadu(tmp);
@@ -381,7 +382,7 @@ public:
 __at_align__ float tmp_b[size()];
 store(tmp);
 b.store(tmp_b);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::hypot(tmp[i], tmp_b[i]);
 }
 return loadu(tmp);
@@ -397,7 +398,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -407,7 +408,7 @@ public:
 __at_align__ float tmp_x[size()];
 store(tmp);
 x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
 }
 return loadu(tmp);
@@ -429,7 +430,7 @@ public:
 __at_align__ float tmp_b[size()];
 store(tmp);
 b.store(tmp_b);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
 }
 return loadu(tmp);
@@ -494,7 +495,7 @@ public:
 __at_align__ float tmp_exp[size()];
 store(tmp);
 exp.store(tmp_exp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = std::pow(tmp[i], tmp_exp[i]);
 }
 return loadu(tmp);
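Note (illustrative, not part of this patch): the vectorized-math hunks above all share one shape: spill the SIMD register to a small stack array, loop over size() (a constexpr element count), apply a scalar function, and reload. A hedged sketch of that shape with plain arrays; kWidth and map_lanes are stand-ins and not the actual Vectorized<T> API surface:

#include <c10/util/irange.h>

// Element-wise map over a fixed-width lane array, in the same style as the
// Vectorized<float>::map() bodies converted above.
template <int kWidth>
void map_lanes(float (*const f)(float), float (&lanes)[kWidth]) {
  // kWidth plays the role of the constexpr Vectorized<float>::size().
  for (const auto i : c10::irange(kWidth)) {
    lanes[i] = f(lanes[i]);
  }
}

// Possible usage: float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//                 map_lanes<8>(+[](float x) { return x * x; }, buf);
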
@@ -6,6 +6,7 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
 #include <iostream>

 namespace at {
@@ -98,7 +99,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
@@ -221,7 +222,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
@@ -435,7 +436,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
@@ -684,7 +685,7 @@ public:
 // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
 // instructions while a loop would be compiled to one instruction.
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 tmp_values[i] = 0;
 }
 std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
@@ -6,6 +6,8 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/native/quantized/affine_quantizer_base.h>
+
+#include <c10/util/irange.h>
 #include <c10/util/qint32.h>
 #include <c10/util/qint8.h>
 #include <c10/util/quint8.h>
@@ -739,7 +741,7 @@ struct VectorizedQuantizedConverter {
 std::array<value_type, size_> vals;

 VectorizedQuantizedConverter(T val) {
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 vals[i] = val.val_;
 }
 }
@@ -757,9 +759,9 @@ struct VectorizedQuantizedConverter {
 Vectorized<float> zero_point,
 Vectorized<float> scale_zp_premul) const {
 float_vec_return_type rv;
-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 float tmp_vals[8];
-for (int j = 0; j < 8; ++j) {
+for (const auto j : c10::irange(8)) {
 tmp_vals[j] = at::native::dequantize_val<T>(
 scale[j], zero_point[j], T(vals[8 * i + j]));
 }
@@ -816,7 +818,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -832,7 +834,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -840,7 +842,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -855,7 +857,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 Vectorized<c10::qint32> zero_point,
 Vectorized<c10::qint32> q_six) {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -864,7 +866,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

 int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
 int_vec_return_type retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval[0].vals[i] = vals[i] - b.vals[i];
 }
 return retval;
@@ -875,7 +877,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
 float multiplier,
 int32_t zero_point) {
 Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] =
 nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
 zero_point;
@@ -948,7 +950,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -964,7 +966,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -972,7 +974,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -986,7 +988,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 Vectorized<c10::qint8> zero_point,
 Vectorized<c10::qint8> q_six) {
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -996,8 +998,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
 int_vec_return_type retval;
 constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 retval[i].vals[j] =
 static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
 static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1013,8 +1015,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
 constexpr auto min_val = std::numeric_limits<value_type>::min();
 constexpr auto max_val = std::numeric_limits<value_type>::max();
 Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 int32_t rounded =
 nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
 zero_point;
@@ -1068,7 +1070,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 std::array<value_type, size()> qvals;
 std::array<float, float_num_vecs() * 8> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
 rhs[i].store(&float_vals[i * 8], 8);
 }

@@ -1084,7 +1086,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -1092,7 +1094,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

 Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
 }
 return retval;
@@ -1107,7 +1109,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 Vectorized<c10::quint8> zero_point,
 Vectorized<c10::quint8> q_six) {
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
 retval.vals[i] = std::min<value_type>(
 std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
 }
@@ -1117,8 +1119,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
 int_vec_return_type retval;
 constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 retval[i].vals[j] =
 static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
 static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1134,8 +1136,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
 constexpr auto min_val = std::numeric_limits<value_type>::min();
 constexpr auto max_val = std::numeric_limits<value_type>::max();
 Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
 int32_t rounded =
 nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
 zero_point;
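Note (illustrative, not part of this patch): the quantized converter hunks convert nested loops; each level gets its own c10::irange, and the inner bound can be a runtime value such as elem_per_int_vec. A small hedged sketch of the same nesting, with invented buffer names and sizes:

#include <c10/util/irange.h>
#include <cstdint>
#include <vector>

// Widening subtraction over groups of lanes, mirroring the nested loops above:
// the outer loop walks the sub-vectors, the inner loop walks lanes within one.
// Assumes a.size() == b.size() == num_vecs * elem_per_vec.
std::vector<int32_t> widening_subtract(const std::vector<int8_t>& a,
                                       const std::vector<int8_t>& b,
                                       int num_vecs, int elem_per_vec) {
  std::vector<int32_t> out(a.size());
  for (const auto i : c10::irange(num_vecs)) {
    for (const auto j : c10::irange(elem_per_vec)) {
      const auto idx = i * elem_per_vec + j;
      out[idx] = static_cast<int32_t>(a[idx]) - static_cast<int32_t>(b[idx]);
    }
  }
  return out;
}
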
@@ -3,6 +3,7 @@
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
 #include <c10/util/complex.h>
+#include <c10/util/irange.h>

 namespace at {
 namespace vec {
@@ -167,7 +168,7 @@ class Vectorized<ComplexDbl> {
 Vectorized<ComplexDbl> map(ComplexDbl (*const f)(ComplexDbl)) const {
 __at_align__ ComplexDbl tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -176,7 +177,7 @@ class Vectorized<ComplexDbl> {
 Vectorized<ComplexDbl> map(ComplexDbl (*const f)(const ComplexDbl&)) const {
 __at_align__ ComplexDbl tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -454,7 +455,7 @@ class Vectorized<ComplexDbl> {
 __at_align__ ComplexDbl y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
 #include <c10/util/complex.h>
+#include <c10/util/irange.h>

 namespace at {
 namespace vec {
@@ -222,7 +223,7 @@ class Vectorized<ComplexFlt> {
 Vectorized<ComplexFlt> map(ComplexFlt (*const f)(ComplexFlt)) const {
 __at_align__ ComplexFlt tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -231,7 +232,7 @@ class Vectorized<ComplexFlt> {
 Vectorized<ComplexFlt> map(ComplexFlt (*const f)(const ComplexFlt&)) const {
 __at_align__ ComplexFlt tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -430,7 +431,7 @@ class Vectorized<ComplexFlt> {
 __at_align__ ComplexFlt y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -3,6 +3,8 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+
+#include <c10/util/irange.h>
 #include <c10/util/quint8.h>
 #include <array>

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
@@ -149,7 +150,7 @@ public:
 Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
 __at_align__ c10::complex<double> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -357,7 +358,7 @@ public:
 __at_align__ c10::complex<double> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);

@@ -4,6 +4,7 @@
 // See Note [Do not compile initializers with AVX]

 #include <c10/util/complex.h>
+#include <c10/util/irange.h>
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
@@ -667,7 +668,7 @@ public:
 Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
 __at_align__ c10::complex<float> tmp[size()];
 store(tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 tmp[i] = f(tmp[i]);
 }
 return loadu(tmp);
@@ -858,7 +859,7 @@ public:
 __at_align__ c10::complex<float> y_tmp[size()];
 store(x_tmp);
 exp.store(y_tmp);
-for (int i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
 x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
 }
 return loadu(x_tmp);
@@ -5,6 +5,7 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -87,7 +88,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -120,7 +121,7 @@ public:
Vectorized<double> map(double (*const f)(double)) const {
__at_align__ double tmp[size()];
store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -200,7 +201,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -210,7 +211,7 @@ public:
__at_align__ double tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);

@@ -5,6 +5,7 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#include <sleef.h>
#endif
@@ -104,7 +105,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0.0;
}
std::memcpy(
@@ -135,7 +136,7 @@ public:
Vectorized<float> map(float (*const f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
@@ -246,7 +247,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
@@ -256,7 +257,7 @@ public:
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);

@@ -6,6 +6,7 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>

namespace at {
namespace vec {
@@ -100,7 +101,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
@@ -253,7 +254,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
@@ -485,7 +486,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (auto i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
@@ -761,7 +762,7 @@ public:
// Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
// for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
// instructions while a loop would be compiled to one instruction.
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(tmp_values, ptr, count * sizeof(int8_t));

@@ -6,6 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/affine_quantizer_base.h>
+
+#include <c10/util/irange.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>
@@ -744,7 +746,7 @@ struct VectorizedQuantizedConverter {
std::array<value_type, size_> vals;

VectorizedQuantizedConverter(T val) {
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
vals[i] = val.val_;
}
}
@@ -762,9 +764,9 @@ struct VectorizedQuantizedConverter {
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
float_vec_return_type rv;
-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
float tmp_vals[16];
-for (int j = 0; j < 16; ++j) {
+for (const auto j : c10::irange(16)) {
tmp_vals[j] = at::native::dequantize_val<T>(
scale[j], zero_point[j], T(vals[16 * i + j]));
}
@@ -829,7 +831,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -845,7 +847,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -853,7 +855,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -868,7 +870,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
Vectorized<c10::qint32> zero_point,
Vectorized<c10::qint32> q_six) {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -877,7 +879,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<

int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
int_vec_return_type retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval[0].vals[i] = vals[i] - b.vals[i];
}
return retval;
@@ -888,7 +890,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
float multiplier,
int32_t zero_point) {
Vectorized<c10::qint32> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] =
nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
zero_point;
@@ -961,7 +963,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -977,7 +979,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -985,7 +987,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<

Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -999,7 +1001,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
Vectorized<c10::qint8> zero_point,
Vectorized<c10::qint8> q_six) {
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -1009,8 +1011,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
int_vec_return_type retval;
constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
retval[i].vals[j] =
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1026,8 +1028,8 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
constexpr auto min_val = std::numeric_limits<value_type>::min();
constexpr auto max_val = std::numeric_limits<value_type>::max();
Vectorized<c10::qint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
int32_t rounded =
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
zero_point;
@@ -1081,7 +1083,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * 16> float_vals;

-for (int i = 0; i < float_num_vecs(); ++i) {
+for (const auto i : c10::irange(float_num_vecs())) {
rhs[i].store(&float_vals[i * 16], 16);
}

@@ -1097,7 +1099,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -1105,7 +1107,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<

Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
}
return retval;
@@ -1120,7 +1122,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
Vectorized<c10::quint8> zero_point,
Vectorized<c10::quint8> q_six) {
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < size(); ++i) {
+for (const auto i : c10::irange(size())) {
retval.vals[i] = std::min<value_type>(
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
}
@@ -1130,8 +1132,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
int_vec_return_type retval;
constexpr int elem_per_int_vec = size() / int_num_vecs();
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
retval[i].vals[j] =
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
@@ -1147,8 +1149,8 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
constexpr auto min_val = std::numeric_limits<value_type>::min();
constexpr auto max_val = std::numeric_limits<value_type>::max();
Vectorized<c10::quint8> retval;
-for (size_t i = 0; i < int_num_vecs(); ++i) {
-for (size_t j = 0; j < elem_per_int_vec; ++j) {
+for (const auto i : c10::irange(int_num_vecs())) {
+for (const auto j : c10::irange(elem_per_int_vec)) {
int32_t rounded =
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
zero_point;

@@ -31,6 +31,7 @@
#include <ATen/native/cpu/zmath.h>
#include <c10/util/TypeCast.h>
#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>

// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
@@ -150,7 +151,7 @@ public:
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
int64_t mask = mask_;
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (mask & 0x01) {
vector[i] = b[i];
} else {
@@ -165,7 +166,7 @@ public:
Vectorized vector;
int_same_size_t<T> buffer[size()];
mask.store(buffer);
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (buffer[i] & 0x01)
{
vector[i] = b[i];
@@ -178,14 +179,14 @@ public:
template<typename step_t> // step sometimes requires a higher precision type (e.g., T=int, step_t=double)
static Vectorized<T> arange(T base = static_cast<T>(0), step_t step = static_cast<step_t>(1)) {
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
vector.values[i] = base + i * step;
}
return vector;
}
static Vectorized<T> set(const Vectorized<T>& a, const Vectorized<T>& b, int64_t count = size()) {
Vectorized vector;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
if (i < count) {
vector[i] = b[i];
} else {
@@ -340,7 +341,7 @@ public:
}
Vectorized<T> atan2(const Vectorized<T> &exp) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::atan2(values[i], exp[i]);
}
return ret;
@@ -380,7 +381,7 @@ public:
// U is for SFINAE purposes only. Make sure it is not changed.
static_assert(std::is_same<U, T>::value, "U must be T");
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::fmod(values[i], q[i]);
}
return ret;
@@ -423,7 +424,7 @@ public:
}
Vectorized<T> hypot(const Vectorized<T> &b) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::hypot(values[i], b[i]);
}
return ret;
@@ -436,14 +437,14 @@ public:
}
Vectorized<T> igamma(const Vectorized<T> &x) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = calc_igamma(values[i], x[i]);
}
return ret;
}
Vectorized<T> igammac(const Vectorized<T> &x) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = calc_igammac(values[i], x[i]);
}
return ret;
@@ -456,7 +457,7 @@ public:
}
Vectorized<T> nextafter(const Vectorized<T> &b) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::nextafter(values[i], b[i]);
}
return ret;
@@ -494,7 +495,7 @@ public:
}
Vectorized<T> pow(const Vectorized<T> &exp) const {
Vectorized<T> ret;
-for (int64_t i = 0; i < size(); i++) {
+for (const auto i : c10::irange(size())) {
ret[i] = std::pow(values[i], exp[i]);
}
return ret;
@@ -808,7 +809,7 @@ inline gather(T const* base_addr, const Vectorized<int_same_size_t<T>>& vindex)
int_same_size_t<T> index_arr[size];
vindex.store(static_cast<void*>(index_arr));
T buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
}
return Vectorized<T>::loadu(static_cast<void*>(buffer));
@@ -826,7 +827,7 @@ inline mask_gather(const Vectorized<T>& src, T const* base_addr,
mask.store(static_cast<void*>(mask_arr));
vindex.store(static_cast<void*>(index_arr));
T buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
if (mask_arr[i] & 0x01) { // check highest bit
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
} else {
@@ -872,7 +873,7 @@ inline Vectorized<int_same_size_t<T>> convert_to_int_of_same_size(const Vectoriz
T src_arr[size];
src.store(static_cast<void*>(src_arr));
int_same_size_t<T> buffer[size];
-for (int64_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
buffer[i] = static_cast<int_same_size_t<T>>(src_arr[i]);
}
return Vectorized<int_same_size_t<T>>::loadu(static_cast<void*>(buffer));
@@ -899,7 +900,7 @@ deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
T buffer2[size];
a.store(static_cast<void*>(a_arr));
b.store(static_cast<void*>(b_arr));
-for (int64_t i = 0; i < half_size; i++) {
+for (const auto i : c10::irange(half_size)) {
buffer1[i] = a_arr[i * 2];
buffer1[half_size + i] = b_arr[i * 2];
buffer2[i] = a_arr[i * 2 + 1];
@@ -931,7 +932,7 @@ interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
T buffer2[size];
a.store(static_cast<void*>(a_arr));
b.store(static_cast<void*>(b_arr));
-for (int64_t i = 0; i < half_size; i++) {
+for (const auto i : c10::irange(half_size)) {
buffer1[i * 2] = a_arr[i];
buffer1[i * 2 + 1] = b_arr[i];
buffer2[i * 2] = a_arr[half_size + i];
@@ -946,7 +947,8 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
#ifndef _MSC_VER
# pragma unroll
#endif
-for (int64_t i = 0; i < n; i++) {
+for (const auto i : c10::irange(n)) {
+(void)i; //Suppress unused variable warning
*dst = c10::static_cast_with_inter_type<dst_T, src_T>::apply(*src);
src++;
dst++;

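One detail from the convert() hunk above: when the induction variable is not used in the loop body, the rewrite adds a (void)i; cast so the irange form does not trigger unused-variable warnings. A hedged sketch of that pattern (the function name copy_cast is hypothetical), assuming <c10/util/irange.h> is available:

    #include <c10/util/irange.h>
    #include <cstdint>

    // Converts n elements; the loop index only counts iterations.
    void copy_cast(const float* src, double* dst, int64_t n) {
      for (const auto i : c10::irange(n)) {
        (void)i; // suppress unused-variable warning; only the pointers advance
        *dst = static_cast<double>(*src);
        src++;
        dst++;
      }
    }
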
@@ -4,6 +4,7 @@

#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
+#include <c10/util/irange.h>

#define CUDABLAS_POSINT_CHECK(FD, X) \
TORCH_CHECK( \
@@ -295,7 +296,7 @@ void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
c, CUDA_R_16F, ldc, stridec,
num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
-for (int64_t i = 0; i < num_batches; ++i) {
+for (const auto i : c10::irange(num_batches)) {
at::cuda::blas::gemm<at::Half>(
transa, transb,
m, n, k,

@@ -1,6 +1,7 @@
#include <ATen/cudnn/Descriptors.h>

#include <ATen/ATen.h>
+#include <c10/util/irange.h>

#include <iostream>
#include <sstream>
@@ -47,11 +48,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr
#undef STR
int size[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
-for (size_t i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = static_cast<int>(t_sizes[i]);
stride[i] = static_cast<int>(t_strides[i]);
}
-for (size_t i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
stride[i] = 1;
}
@@ -126,10 +127,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
"cuDNN filters (a.k.a. weights) must be contiguous in desired memory_format");

int size[CUDNN_DIM_MAX];
-for (int i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = (int) t.size(i);
}
-for (int i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = (int) 1;
}
dim = std::max(dim, pad);

@@ -1,5 +1,6 @@
#include <ATen/miopen/Descriptors.h>
#include <ATen/ATen.h>
+#include <c10/util/irange.h>

#include <iostream>

@@ -39,11 +40,11 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
#undef STR
int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
-for (size_t i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = static_cast<int>(t_sizes[i]);
stride[i] = static_cast<int>(t_strides[i]);
}
-for (size_t i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
stride[i] = 1;
}
@@ -103,10 +104,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo

int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
-for (int i = 0; i < dim; ++i) {
+for (const auto i : c10::irange(dim)) {
size[i] = (int) t.size(i);
}
-for (int i = dim; i < pad; ++i) {
+for (const auto i : c10::irange(dim, pad)) {
size[i] = (int) 1;
}

@@ -500,7 +500,7 @@ inline void _rrelu_with_noise_train(
scalar_t* noise_data = noise.data_ptr<scalar_t>();
auto gen = at::get_generator_or_default<CPUGeneratorImpl>(generator, detail::getDefaultCPUGenerator());
std::lock_guard<std::mutex> lock(gen->mutex_);
-for (int64_t i = 0; i < input.numel(); i++) {
+for (const auto i : c10::irange(input.numel())) {
if (input_data[i] <= 0) {
at::uniform_real_distribution<double> uniform(lower, upper);
const scalar_t r = (scalar_t)uniform(gen);
@@ -610,7 +610,7 @@ void inline prelu_cpu_kernel_share_weights(
auto weight_val = weight.data_ptr<scalar_t>()[0];

at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
scalar_t input_data_val = input_data[i];
// to allow for compiler optimization, here splitting into two lines:
scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val;
@@ -725,7 +725,7 @@ void inline prelu_cpu_backward_kernel_share_weights(
scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0),
[&](int64_t start, int64_t end, scalar_t ident) -> scalar_t {
scalar_t partial_sum = ident;
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
scalar_t input_data_val = input_data[i];
scalar_t grad_out_data_val = grad_out_data[i];
// to allow for compiler optimization, here splitting into two lines:
@@ -839,7 +839,7 @@ std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Ten
std::vector<int64_t> reduce_dims;
reduce_dims.push_back(0);
if (dims > 2) {
-for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i);
+for (const auto i : c10::irange(2, dims))reduce_dims.push_back(i);
}
weight_grad = weight_grad_collector.sum(reduce_dims);
}

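The Activation.cpp hunks above also show the two-argument form: c10::irange(start, end) iterates the half-open range [start, end), which is how the bodies of at::parallel_for and at::parallel_reduce are rewritten. A minimal sketch under the assumption of an ATen build (the function name scale_all is hypothetical):

    #include <ATen/Parallel.h>
    #include <c10/util/irange.h>
    #include <cstdint>
    #include <vector>

    // Scales every element in parallel chunks; each chunk walks [start, end).
    void scale_all(std::vector<float>& data, float factor) {
      at::parallel_for(0, static_cast<int64_t>(data.size()), /*grain_size=*/1000,
                       [&](int64_t start, int64_t end) {
        for (const auto i : c10::irange(start, end)) {
          data[i] *= factor;
        }
      });
    }
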
@@ -2,6 +2,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/native/AdaptivePooling.h>
#include <ATen/native/xnnpack/Engine.h>
+#include <c10/util/irange.h>


namespace at {
@@ -16,7 +17,7 @@ namespace {
{
TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2");
int64_t ndim = input.ndimension();
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
@@ -52,7 +53,7 @@ namespace {
const Tensor& input)
{
int64_t ndim = grad_output.ndimension();
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(grad_output.size(i) > 0,
"adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being "

@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {
@@ -33,19 +34,19 @@ static void adaptive_avg_pool3d_out_frame(
int64_t istrideH,
int64_t istrideW) {
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
-for (int64_t d = start; d < end; d++) {
+for (const auto d : c10::irange(start, end)) {
/* loop over output */
-for (int64_t ot = 0; ot < osizeT; ot++) {
+for (const auto ot : c10::irange(osizeT)) {
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;

-for (int64_t oh = 0; oh < osizeH; oh++) {
+for (const auto oh : c10::irange(osizeH)) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

-for (int64_t ow = 0; ow < osizeW; ow++) {
+for (const auto ow : c10::irange(osizeW)) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
@@ -58,9 +59,9 @@ static void adaptive_avg_pool3d_out_frame(

/* compute local average: */
scalar_t sum = 0;
-for (int it = 0; it < kT; it++) {
+for (const auto it : c10::irange(kT)) {
-for (int ih = 0; ih < kH; ih++) {
+for (const auto ih : c10::irange(kH)) {
-for (int iw = 0; iw < kW; iw++) {
+for (const auto iw : c10::irange(kW)) {
scalar_t val =
*(ip + it * istrideT + ih * istrideH + iw * istrideW);
sum += val;
@@ -83,7 +84,7 @@ void adaptive_avg_pool3d_out_cpu_template(
IntArrayRef output_size) {
TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3");

-for (int64_t i = 1; i < input.ndimension(); i++) {
+for (const auto i : c10::irange(1, input.ndimension())) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_avg_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
@@ -148,7 +149,7 @@ void adaptive_avg_pool3d_out_cpu_template(
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; ++b) {
+for (const auto b : c10::irange(start, end)) {
adaptive_avg_pool3d_out_frame<scalar_t>(
input_data + b * input.stride(0),
output_data + b * sizeD * osizeT * osizeH * osizeW,
@@ -181,22 +182,22 @@ static void adaptive_avg_pool3d_backward_out_frame(
int64_t osizeH,
int64_t osizeW) {
at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) {
-for (int64_t d = start; d < end; d++) {
+for (const auto d : c10::irange(start, end)) {
scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH;
scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH;

/* calculate average */
-for (int64_t ot = 0; ot < osizeT; ot++) {
+for (const auto ot : c10::irange(osizeT)) {
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;

-for (int64_t oh = 0; oh < osizeH; oh++) {
+for (const auto oh : c10::irange(osizeH)) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

-for (int64_t ow = 0; ow < osizeW; ow++) {
+for (const auto ow : c10::irange(osizeW)) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
@@ -205,9 +206,9 @@ static void adaptive_avg_pool3d_backward_out_frame(
gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT /
kH / kW;

-for (int it = istartT; it < iendT; it++) {
+for (const auto it : c10::irange(istartT, iendT)) {
-for (int ih = istartH; ih < iendH; ih++) {
+for (const auto ih : c10::irange(istartH, iendH)) {
-for (int iw = istartW; iw < iendW; iw++) {
+for (const auto iw : c10::irange(istartW, iendW)) {
/* update gradient */
gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] +=
grad_delta;
@@ -265,7 +266,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template(
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t* gradOutput_data = gradOutput.data_ptr<scalar_t>();
at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
adaptive_avg_pool3d_backward_out_frame<scalar_t>(
gradInput_data + b * sizeD * isizeT * isizeH * isizeW,
gradOutput_data + b * sizeD * osizeT * osizeH * osizeW,

|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/AdaptivePooling.h>
|
#include <ATen/native/AdaptivePooling.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
@ -10,7 +11,7 @@ TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_si
|
|||||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||||
"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ",
|
"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ",
|
||||||
input.sizes());
|
input.sizes());
|
||||||
for (int64_t i = 1; i < ndim; i++) {
|
for (const auto i : c10::irange(1, ndim)) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
|
||||||
"but input has sizes ", input.sizes(), " with dimension ", i,
|
"but input has sizes ", input.sizes(), " with dimension ", i,
|
||||||
@ -51,7 +52,7 @@ TORCH_META_FUNC(adaptive_max_pool2d_backward)
|
|||||||
int64_t ndim = grad_output.ndimension();
|
int64_t ndim = grad_output.ndimension();
|
||||||
TORCH_CHECK(ndim == 3 || ndim == 4,
|
TORCH_CHECK(ndim == 3 || ndim == 4,
|
||||||
"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes());
|
"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes());
|
||||||
for (int64_t i = 1; i < ndim; i++) {
|
for (const auto i : c10::irange(1, ndim)) {
|
||||||
TORCH_CHECK(grad_output.size(i) > 0,
|
TORCH_CHECK(grad_output.size(i) > 0,
|
||||||
"adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
"adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, "
|
||||||
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i,
|
"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i,
|
||||||
|
@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>
#include <tuple>


@@ -11,7 +12,7 @@ TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_si
TORCH_CHECK(
ndim == 4 || ndim == 5,
"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: ", input.sizes());
-for (int64_t i = 1; i < ndim; i++) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
@@ -96,8 +97,7 @@ static void adaptive_max_pool3d_single_out_frame(
int64_t istrideW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-for (auto d = start; d < end; d++)
-{
+for (const auto d : c10::irange(start, end)) {
/* loop over output */
int64_t ot, oh, ow;
for(ot = 0; ot < osizeT; ot++)
@@ -176,8 +176,7 @@ static void adaptive_max_pool3d_out_frame(
int64_t istrideW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-for (auto b = start; b < end; b++)
-{
+for (const auto b : c10::irange(start, end)) {
adaptive_max_pool3d_single_out_frame<scalar_t>(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW,
indices_data+b*sizeD*osizeT*osizeH*osizeW,
sizeD,
@@ -203,8 +202,7 @@ static void adaptive_max_pool3d_backward_single_out_frame(
int64_t osizeW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-for (auto d = start; d < end; d++)
-{
+for (const auto d : c10::irange(start, end)) {
scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW;
scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW;
int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW;
@@ -244,8 +242,7 @@ static void adaptive_max_pool3d_backward_out_frame(
int64_t osizeW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-for (auto b = start; b < end; b++)
-{
+for (const auto b : c10::irange(start, end)) {
adaptive_max_pool3d_backward_single_out_frame<scalar_t>(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW,
indices_data+b*sizeD*osizeT*osizeH*osizeW,
sizeD,

@ -2,6 +2,7 @@
|
|||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/Pool.h>
|
#include <ATen/native/Pool.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
|
|
||||||
@ -169,8 +170,7 @@ static void avg_pool3d_out_frame(
|
|||||||
c10::optional<int64_t> divisor_override)
|
c10::optional<int64_t> divisor_override)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
|
|
||||||
@ -315,7 +315,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) (
|
|||||||
scalar_t *output_data = output.data_ptr<scalar_t>();
|
scalar_t *output_data = output.data_ptr<scalar_t>();
|
||||||
|
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++) {
|
for (const auto p : c10::irange(start, end)) {
|
||||||
avg_pool3d_out_frame(
|
avg_pool3d_out_frame(
|
||||||
input_data + p * istride, output_data + p * ostride, nslices,
|
input_data + p * istride, output_data + p * ostride, nslices,
|
||||||
itime, iwidth, iheight,
|
itime, iwidth, iheight,
|
||||||
@ -358,8 +358,7 @@ static void avg_pool3d_backward_out_frame(
|
|||||||
c10::optional<int64_t> divisor_override)
|
c10::optional<int64_t> divisor_override)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
|
|
||||||
@ -500,8 +499,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) (
|
|||||||
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
|
||||||
|
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
avg_pool3d_backward_out_frame(
|
avg_pool3d_backward_out_frame(
|
||||||
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
|
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
|
||||||
itime, iwidth, iheight,
|
itime, iwidth, iheight,
|
||||||
|
@ -63,7 +63,7 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
|||||||
std::function<void(int64_t, int64_t)> loop = [](int64_t, int64_t){};
|
std::function<void(int64_t, int64_t)> loop = [](int64_t, int64_t){};
|
||||||
if (upper) {
|
if (upper) {
|
||||||
loop = [&](int64_t start, int64_t end) {
|
loop = [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; i++) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
for (int64_t j = i + 1; j < n; j++) {
|
for (int64_t j = i + 1; j < n; j++) {
|
||||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||||
}
|
}
|
||||||
@ -71,8 +71,8 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo
|
|||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
loop = [&](int64_t start, int64_t end) {
|
loop = [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; i++) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
for (int64_t j = 0; j < i; j++) {
|
for (const auto j : c10::irange(i)) {
|
||||||
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
self[i * stride + j] = conj_impl(self[j * stride + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -106,7 +106,7 @@ void apply_cholesky_inverse(Tensor& input, Tensor& infos, bool upper) {
|
|||||||
auto n = input.size(-2);
|
auto n = input.size(-2);
|
||||||
auto lda = std::max<int64_t>(1, n);
|
auto lda = std::max<int64_t>(1, n);
|
||||||
|
|
||||||
for (int64_t i = 0; i < batch_size; i++) {
|
for (const auto i : c10::irange(batch_size)) {
|
||||||
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
|
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
|
||||||
int* info_working_ptr = &infos_data[i];
|
int* info_working_ptr = &infos_data[i];
|
||||||
lapackCholeskyInverse<scalar_t>(uplo, n, input_working_ptr, lda, info_working_ptr);
|
lapackCholeskyInverse<scalar_t>(uplo, n, input_working_ptr, lda, info_working_ptr);
|
||||||
@ -501,7 +501,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) {
|
|||||||
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
|
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
|
||||||
Tensor work = at::empty({lwork}, self.options());
|
Tensor work = at::empty({lwork}, self.options());
|
||||||
|
|
||||||
for (int64_t i = 0; i < batch_size; i++) {
|
for (const auto i : c10::irange(batch_size)) {
|
||||||
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
|
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
|
||||||
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
|
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/Config.h>
|
#include <ATen/Config.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#if AT_BUILD_WITH_BLAS()
|
#if AT_BUILD_WITH_BLAS()
|
||||||
extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy);
|
extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy);
|
||||||
@ -151,7 +152,7 @@ inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx)
|
|||||||
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
|
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int64_t i = 0; i < n; i++) {
|
for (const auto i : c10::irange(n)) {
|
||||||
if (a == scalar_t(0)) {
|
if (a == scalar_t(0)) {
|
||||||
x[i * incx] = 0;
|
x[i * incx] = 0;
|
||||||
} else {
|
} else {
|
||||||
@ -176,11 +177,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((trans == 'T') || (trans == 't')) {
|
if ((trans == 'T') || (trans == 't')) {
|
||||||
for (int64_t i = 0; i < n; i++)
|
for (const auto i : c10::irange(n)) {
|
||||||
{
|
|
||||||
scalar_t sum = 0;
|
scalar_t sum = 0;
|
||||||
scalar_t *row_ = a + lda * i;
|
scalar_t *row_ = a + lda * i;
|
||||||
for (int64_t j = 0; j < m; j++) {
|
for (const auto j : c10::irange(m)) {
|
||||||
sum += x[j * incx] * row_[j];
|
sum += x[j * incx] * row_[j];
|
||||||
}
|
}
|
||||||
if (beta == scalar_t(0)) {
|
if (beta == scalar_t(0)) {
|
||||||
@ -192,10 +192,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t
|
|||||||
} else {
|
} else {
|
||||||
if (beta != scalar_t(1) && beta != scalar_t(0)) scal<scalar_t>(m, beta, y, incy);
|
if (beta != scalar_t(1) && beta != scalar_t(0)) scal<scalar_t>(m, beta, y, incy);
|
||||||
|
|
||||||
for (int64_t j = 0; j < n; j++) {
|
for (const auto j : c10::irange(n)) {
|
||||||
scalar_t *column_ = a + lda * j;
|
scalar_t *column_ = a + lda * j;
|
||||||
scalar_t z = alpha * x[j * incx];
|
scalar_t z = alpha * x[j * incx];
|
||||||
for (int64_t i = 0; i < m; i++) {
|
for (const auto i : c10::irange(m)) {
|
||||||
//output values are ignored if beta is 0, and set to 0, nans and infs are not propagated
|
//output values are ignored if beta is 0, and set to 0, nans and infs are not propagated
|
||||||
if (j==0 && beta==scalar_t(0)) {
|
if (j==0 && beta==scalar_t(0)) {
|
||||||
y[i * incy] = scalar_t(0);
|
y[i * incy] = scalar_t(0);
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <ATen/native/BucketizationUtils.h>
|
#include <ATen/native/BucketizationUtils.h>
|
||||||
#include <ATen/native/Resize.h>
|
#include <ATen/native/Resize.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
/* Implement a TF like searchsorted and a bucketize function running on cpu
|
/* Implement a TF like searchsorted and a bucketize function running on cpu
|
||||||
*
|
*
|
||||||
@ -58,7 +59,7 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
|
|||||||
|
|
||||||
bool is_1d_boundaries = boundaries.dim() == 1;
|
bool is_1d_boundaries = boundaries.dim() == 1;
|
||||||
at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t i = start; i < end; ++i) {
|
for (const auto i : c10::irange(start, end)) {
|
||||||
// If boundaries tensor is 1d, we always search the entire boundary tensor
|
// If boundaries tensor is 1d, we always search the entire boundary tensor
|
||||||
int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd;
|
int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd;
|
||||||
const input_t *data_bd_start = &data_bd[start_bd];
|
const input_t *data_bd_start = &data_bd[start_bd];
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <ATen/native/im2col.h>
|
#include <ATen/native/im2col.h>
|
||||||
#include <ATen/native/im2col_shape_check.h>
|
#include <ATen/native/im2col_shape_check.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
// Note [im2col/col2im output padding]
|
// Note [im2col/col2im output padding]
|
||||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
@ -150,7 +151,7 @@ static void col2im_out_cpu_template(
|
|||||||
stride_width +
|
stride_width +
|
||||||
1;
|
1;
|
||||||
|
|
||||||
for (int64_t elt = 0; elt < batch_size; elt++) {
|
for (const auto elt : c10::irange(batch_size)) {
|
||||||
input_n = input.select(0, elt);
|
input_n = input.select(0, elt);
|
||||||
output_n = output.select(0, elt);
|
output_n = output.select(0, elt);
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ inline Tensor view_tensor(
|
|||||||
|
|
||||||
inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
|
inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
|
||||||
DimVector res(oldstride.size() + 1);
|
DimVector res(oldstride.size() + 1);
|
||||||
for(size_t i = 0; i < oldstride.size(); i++) {
|
for (const auto i : c10::irange(oldstride.size())) {
|
||||||
res[i] = oldstride[i] * 2;
|
res[i] = oldstride[i] * 2;
|
||||||
}
|
}
|
||||||
res.back() = 1;
|
res.back() = 1;
|
||||||
|
@ -47,7 +47,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
|
|||||||
new_shape.emplace_back(input_sizes[i]);
|
new_shape.emplace_back(input_sizes[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < (size_t)l_pad; i++) {
|
for (const auto i : c10::irange((size_t)l_pad)) {
|
||||||
auto pad_idx = pad.size() - ((i + 1) * 2);
|
auto pad_idx = pad.size() - ((i + 1) * 2);
|
||||||
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
|
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
|
||||||
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
|
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <ATen/detail/CUDAHooksInterface.h>
|
#include <ATen/detail/CUDAHooksInterface.h>
|
||||||
#include <c10/util/env.h>
|
#include <c10/util/env.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ static inline std::vector<int64_t> conv_output_size(
|
|||||||
std::vector<int64_t> output_size(dim);
|
std::vector<int64_t> output_size(dim);
|
||||||
output_size[0] = input_size[input_batch_size_dim];
|
output_size[0] = input_size[input_batch_size_dim];
|
||||||
output_size[1] = weight_size[weight_output_channels_dim];
|
output_size[1] = weight_size[weight_output_channels_dim];
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
auto dilation_ = has_dilation ? dilation[d - 2] : 1;
|
auto dilation_ = has_dilation ? dilation[d - 2] : 1;
|
||||||
auto kernel = dilation_ * (weight_size[d] - 1) + 1;
|
auto kernel = dilation_ * (weight_size[d] - 1) + 1;
|
||||||
output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
|
output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
|
||||||
@ -53,7 +54,7 @@ static inline std::vector<int64_t> conv_input_size(
|
|||||||
std::vector<int64_t> input_size(dim);
|
std::vector<int64_t> input_size(dim);
|
||||||
input_size[0] = output_size[output_batch_size_dim];
|
input_size[0] = output_size[output_batch_size_dim];
|
||||||
input_size[1] = weight_size[weight_input_channels_dim] * groups;
|
input_size[1] = weight_size[weight_input_channels_dim] * groups;
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
|
int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
|
||||||
input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
|
input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
|
||||||
kernel + output_padding[d - 2];
|
kernel + output_padding[d - 2];
|
||||||
@ -69,7 +70,7 @@ static inline std::vector<int64_t> conv_weight_size(
|
|||||||
std::vector<int64_t> weight_size(dim);
|
std::vector<int64_t> weight_size(dim);
|
||||||
weight_size[0] = output_size[1];
|
weight_size[0] = output_size[1];
|
||||||
weight_size[1] = input_size[1] / groups;
|
weight_size[1] = input_size[1] / groups;
|
||||||
for (size_t d = 2; d < dim; ++d) {
|
for (const auto d : c10::irange(2, dim)) {
|
||||||
int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
|
int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
|
||||||
+ 2 * padding[d - 2] - output_padding[d - 2];
|
+ 2 * padding[d - 2] - output_padding[d - 2];
|
||||||
weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
|
weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
|
||||||
|
@ -975,7 +975,7 @@ at::Tensor _convolution(
|
|||||||
} else {
|
} else {
|
||||||
std::vector<Tensor> outputs(params.groups);
|
std::vector<Tensor> outputs(params.groups);
|
||||||
input = input.contiguous();
|
input = input.contiguous();
|
||||||
for (int g = 0; g < params.groups; ++g) {
|
for (const auto g : c10::irange(params.groups)) {
|
||||||
auto input_g = subtensor(input, 1, params.groups, g);
|
auto input_g = subtensor(input, 1, params.groups, g);
|
||||||
auto weight_g = subtensor(weight, 0, params.groups, g);
|
auto weight_g = subtensor(weight, 0, params.groups, g);
|
||||||
auto bias_g = subtensor(bias, 0, params.groups, g);
|
auto bias_g = subtensor(bias, 0, params.groups, g);
|
||||||
@ -1212,7 +1212,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std::vector<Tensor> gWt_list(groups);
|
std::vector<Tensor> gWt_list(groups);
|
||||||
for (int g = 0; g < groups; ++g) {
|
for (const auto g : c10::irange(groups)) {
|
||||||
auto ggIt_g = subvariable(ggIt, 0, groups, g);
|
auto ggIt_g = subvariable(ggIt, 0, groups, g);
|
||||||
auto gOt_g = subvariable(gOt, 0, groups, g);
|
auto gOt_g = subvariable(gOt, 0, groups, g);
|
||||||
if (gOt_g.is_cuda()) {
|
if (gOt_g.is_cuda()) {
|
||||||
@ -1239,7 +1239,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
// the ConvForward kernels don't support asymmetric padding.
|
// the ConvForward kernels don't support asymmetric padding.
|
||||||
auto gW_size = gW.sizes();
|
auto gW_size = gW.sizes();
|
||||||
auto w_size = weight.sizes();
|
auto w_size = weight.sizes();
|
||||||
for (size_t i = 2; i < gW_size.size(); ++i) {
|
for (const auto i : c10::irange(2, gW_size.size())) {
|
||||||
if (gW_size[i] > w_size[i]) {
|
if (gW_size[i] > w_size[i]) {
|
||||||
gW = gW.narrow(i, 0, w_size[i]);
|
gW = gW.narrow(i, 0, w_size[i]);
|
||||||
gW_size = gW.sizes();
|
gW_size = gW.sizes();
|
||||||
@ -1268,7 +1268,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
// rather than narrowing the computed gI
|
// rather than narrowing the computed gI
|
||||||
auto gI_size = gI.sizes();
|
auto gI_size = gI.sizes();
|
||||||
auto i_size = input.sizes();
|
auto i_size = input.sizes();
|
||||||
for (size_t i = 2; i < gI_size.size(); ++i) {
|
for (const auto i : c10::irange(2, gI_size.size())) {
|
||||||
if (gI_size[i] > i_size[i]) {
|
if (gI_size[i] > i_size[i]) {
|
||||||
gI = gI.narrow(i, 0, i_size[i]);
|
gI = gI.narrow(i, 0, i_size[i]);
|
||||||
gI_size = gI.sizes();
|
gI_size = gI.sizes();
|
||||||
@ -1289,7 +1289,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
|
|||||||
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
|
gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for(size_t i = 0; i < kernel_size.size(); ++i) {
|
for (const auto i : c10::irange(kernel_size.size())) {
|
||||||
// Check if whole input has been used or not
|
// Check if whole input has been used or not
|
||||||
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
|
auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]
|
||||||
- 2 * gi_conv_params.padding[i]
|
- 2 * gi_conv_params.padding[i]
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/native/CPUBlas.h>
|
#include <ATen/native/CPUBlas.h>
|
||||||
#include <ATen/native/Unfold2d.h>
|
#include <ATen/native/Unfold2d.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
namespace native {
|
namespace native {
|
||||||
@ -299,7 +300,7 @@ void slow_conv2d_backward_out_cpu_template(
|
|||||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||||
auto fgrad_input = std::make_unique<scalar_t[]>(
|
auto fgrad_input = std::make_unique<scalar_t[]>(
|
||||||
c10::multiply_integers(finput.sizes().slice(1)));
|
c10::multiply_integers(finput.sizes().slice(1)));
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto grad_input_t = grad_input_a[t];
|
auto grad_input_t = grad_input_a[t];
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
slow_conv2d_backward_update_grad_input_frame(
|
slow_conv2d_backward_update_grad_input_frame(
|
||||||
@ -478,7 +479,7 @@ std::tuple<Tensor&, Tensor&> slow_conv2d_forward_out_cpu(
|
|||||||
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
|
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
|
||||||
|
|
||||||
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto input_t = input_a[t];
|
auto input_t = input_a[t];
|
||||||
auto output_t = output_a[t];
|
auto output_t = output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/native/CPUBlas.h>
|
#include <ATen/native/CPUBlas.h>
|
||||||
#include <ATen/native/Unfold3d.h>
|
#include <ATen/native/Unfold3d.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
constexpr int64_t CONV3D_GRAIN_SALT = 20;
|
constexpr int64_t CONV3D_GRAIN_SALT = 20;
|
||||||
|
|
||||||
@ -358,7 +359,7 @@ void slow_conv3d_backward_out_cpu_template(
|
|||||||
auto fgrad_input_a = fgrad_input.accessor<scalar_t, 3>();
|
auto fgrad_input_a = fgrad_input.accessor<scalar_t, 3>();
|
||||||
auto weight_2d_a = weight2d.accessor<scalar_t, 2>();
|
auto weight_2d_a = weight2d.accessor<scalar_t, 2>();
|
||||||
|
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto grad_input_t = grad_input_a[t];
|
auto grad_input_t = grad_input_a[t];
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
auto fgrad_input_t = fgrad_input_a[t];
|
auto fgrad_input_t = fgrad_input_a[t];
|
||||||
@ -462,7 +463,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template(
|
|||||||
auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
|
auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
|
||||||
auto grad_output_a = grad_output_contiguous.accessor<scalar_t, 5>();
|
auto grad_output_a = grad_output_contiguous.accessor<scalar_t, 5>();
|
||||||
auto finput_a = finput.accessor<scalar_t, 3>();
|
auto finput_a = finput.accessor<scalar_t, 3>();
|
||||||
for (int64_t t = 0; t < batch_size; t++) {
|
for (const auto t : c10::irange(batch_size)) {
|
||||||
auto grad_output_t = grad_output_a[t];
|
auto grad_output_t = grad_output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
slow_conv3d_backward_weight_frame(
|
slow_conv3d_backward_weight_frame(
|
||||||
@ -564,7 +565,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv3d_forward_out_cpu(const Tensor&
|
|||||||
|
|
||||||
at::parallel_for(
|
at::parallel_for(
|
||||||
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
|
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t t = start; t < end; t++) {
|
for (const auto t : c10::irange(start, end)) {
|
||||||
auto input_t = input_a[t];
|
auto input_t = input_a[t];
|
||||||
auto output_t = output_a[t];
|
auto output_t = output_a[t];
|
||||||
auto finput_t = finput_a[t];
|
auto finput_t = finput_a[t];
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
@ -39,7 +40,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
|||||||
weight_size[2],
|
weight_size[2],
|
||||||
}, self.options());
|
}, self.options());
|
||||||
output.copy_(bias.expand(output.sizes()));
|
output.copy_(bias.expand(output.sizes()));
|
||||||
for (int k = 0; k < kw; k++) {
|
for (const auto k : c10::irange(kw)) {
|
||||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <ATen/MemoryOverlap.h>
|
#include <ATen/MemoryOverlap.h>
|
||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
#ifdef USE_FBGEMM
|
#ifdef USE_FBGEMM
|
||||||
@ -65,16 +66,16 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
|||||||
int nc = std::min(NC - C, BLOCK_SZ);
|
int nc = std::min(NC - C, BLOCK_SZ);
|
||||||
|
|
||||||
// 1. copy columns from src to buf
|
// 1. copy columns from src to buf
|
||||||
for (int c = 0; c < nc; c++) {
|
for (const auto c : c10::irange(nc)) {
|
||||||
memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t));
|
memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. transpose buf in place
|
// 2. transpose buf in place
|
||||||
int rc_max = std::max(nr, nc);
|
int rc_max = std::max(nr, nc);
|
||||||
int rc_min = std::min(nr, nc);
|
int rc_min = std::min(nr, nc);
|
||||||
for (int r = 0; r < rc_max; r++) {
|
for (const auto r : c10::irange(rc_max)) {
|
||||||
int end = std::min(r, rc_min);
|
int end = std::min(r, rc_min);
|
||||||
for (int c = 0; c < end; c++) {
|
for (const auto c : c10::irange(end)) {
|
||||||
scalar_t tmp = bp[r + BLOCK_SZ * c];
|
scalar_t tmp = bp[r + BLOCK_SZ * c];
|
||||||
bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
|
bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
|
||||||
bp[r * BLOCK_SZ + c] = tmp;
|
bp[r * BLOCK_SZ + c] = tmp;
|
||||||
@ -82,7 +83,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 3. copy rows from buf to dst
|
// 3. copy rows from buf to dst
|
||||||
for (int r = 0; r < nr; r++) {
|
for (const auto r : c10::irange(nr)) {
|
||||||
memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t));
|
memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
|
|
||||||
#include <ATen/native/Cross.h>
|
#include <ATen/native/Cross.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const c10::option
|
|||||||
|
|
||||||
int64_t dim = -1;
|
int64_t dim = -1;
|
||||||
if(!dimension.has_value()) {
|
if(!dimension.has_value()) {
|
||||||
for(int64_t i = 0; i < input.dim(); i++) {
|
for (const auto i : c10::irange(input.dim())) {
|
||||||
if(input.size(i) == 3) {
|
if(input.size(i) == 3) {
|
||||||
dim = i;
|
dim = i;
|
||||||
break;
|
break;
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <ATen/div_rtn.h>
|
#include <ATen/div_rtn.h>
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||||
TORCH_CHECK( \
|
TORCH_CHECK( \
|
||||||
@ -43,7 +44,7 @@ std::vector<int64_t> get_output_size(
|
|||||||
IntArrayRef pad_size,
|
IntArrayRef pad_size,
|
||||||
IntArrayRef dilation_size) {
|
IntArrayRef dilation_size) {
|
||||||
std::vector<int64_t> sizes;
|
std::vector<int64_t> sizes;
|
||||||
for (int index = 0; index < dim; index++) {
|
for (const auto index : c10::irange(dim)) {
|
||||||
sizes.push_back(
|
sizes.push_back(
|
||||||
div_rtn<int64_t>(
|
div_rtn<int64_t>(
|
||||||
input.size(index + input.dim() - dim) + 2 * pad_size[index] -
|
input.size(index + input.dim() - dim) + 2 * pad_size[index] -
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/native/Pool.h>
|
#include <ATen/native/Pool.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
|
||||||
|
|
||||||
@ -37,8 +38,7 @@ static void max_pool3d_with_indices_single_out_frame(
|
|||||||
int dilationH)
|
int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
/* loop over output */
|
/* loop over output */
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||||
int64_t i, j, ti;
|
int64_t i, j, ti;
|
||||||
@ -120,8 +120,7 @@ static void max_pool3d_with_indices_out_frame(
|
|||||||
int dilationT, int dilationW, int dilationH)
|
int dilationT, int dilationW, int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
max_pool3d_with_indices_single_out_frame(
|
max_pool3d_with_indices_single_out_frame(
|
||||||
input_data + p * istride,
|
input_data + p * istride,
|
||||||
output_data + p * ostride,
|
output_data + p * ostride,
|
||||||
@ -285,8 +284,7 @@ static void max_pool3d_with_indices_backward_single_out_frame(
|
|||||||
int dilationH)
|
int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto k = start; k < end; k++)
|
for (const auto k : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
|
scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
|
||||||
scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
|
scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
|
||||||
int64_t *indz_p_k = indz_p + k * otime * owidth * oheight;
|
int64_t *indz_p_k = indz_p + k * otime * owidth * oheight;
|
||||||
@ -330,8 +328,7 @@ static void max_pool3d_with_indices_backward_out_frame(
|
|||||||
int dilationT, int dilationW, int dilationH)
|
int dilationT, int dilationW, int dilationH)
|
||||||
{
|
{
|
||||||
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto p = start; p < end; p++)
|
for (const auto p : c10::irange(start, end)) {
|
||||||
{
|
|
||||||
max_pool3d_with_indices_backward_single_out_frame<scalar_t>(
|
max_pool3d_with_indices_backward_single_out_frame<scalar_t>(
|
||||||
gradInput_data + p * istride,
|
gradInput_data + p * istride,
|
||||||
gradOutput_data + p * ostride,
|
gradOutput_data + p * ostride,
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/Dispatch.h>
|
#include <ATen/Dispatch.h>
|
||||||
#include <ATen/NamedTensorUtils.h>
|
#include <ATen/NamedTensorUtils.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -16,8 +17,10 @@ Tensor make_feature_noise(const Tensor& input) {
|
|||||||
sizes.reserve(input.dim());
|
sizes.reserve(input.dim());
|
||||||
sizes.push_back(input_sizes[0]);
|
sizes.push_back(input_sizes[0]);
|
||||||
sizes.push_back(input_sizes[1]);
|
sizes.push_back(input_sizes[1]);
|
||||||
for (int64_t i = 2; i < input.dim(); ++i)
|
for (const auto i : c10::irange(2, input.dim())) {
|
||||||
|
(void)i; //Suppress unused variable warning
|
||||||
sizes.push_back(1);
|
sizes.push_back(1);
|
||||||
|
}
|
||||||
return at::empty(sizes, input.options());
|
return at::empty(sizes, input.options());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,7 +123,7 @@ Tensor embedding_dense_backward_cpu(
|
|||||||
|
|
||||||
auto parallel_section = [&](index_t start, index_t end) {
|
auto parallel_section = [&](index_t start, index_t end) {
|
||||||
TensorIterator iter(add_iter);
|
TensorIterator iter(add_iter);
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
if (indices_data[i] != padding_idx) {
|
if (indices_data[i] != padding_idx) {
|
||||||
index_t k = indices_data[i];
|
index_t k = indices_data[i];
|
||||||
if (k >= start && k < end) {
|
if (k >= start && k < end) {
|
||||||
@ -167,7 +167,7 @@ Tensor & embedding_renorm_cpu_(
|
|||||||
|
|
||||||
// Note that we cannot use at::parallel_for here because we perform operations on
|
// Note that we cannot use at::parallel_for here because we perform operations on
|
||||||
// Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details.
|
// Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details.
|
||||||
for (auto i = 0; i < num_indices; i++) {
|
for (const auto i : c10::irange(num_indices)) {
|
||||||
if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) {
|
if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -107,7 +107,7 @@ index_select_add(const Tensor &select_indices,
|
|||||||
auto output_stride0 = output.strides()[0];
|
auto output_stride0 = output.strides()[0];
|
||||||
auto output_stride1 = output.strides()[1];
|
auto output_stride1 = output.strides()[1];
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
@ -247,7 +247,7 @@ index_select_add(const Tensor &select_indices,
|
|||||||
auto output_stride0 = output.strides()[0];
|
auto output_stride0 = output.strides()[0];
|
||||||
auto output_stride1 = output.strides()[1];
|
auto output_stride1 = output.strides()[1];
|
||||||
auto numel = add_indices.numel();
|
auto numel = add_indices.numel();
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
@ -302,14 +302,14 @@ index_select_scale_add(const Tensor &select_indices,
|
|||||||
auto* scale_data = scale.data_ptr<data_t>();
|
auto* scale_data = scale.data_ptr<data_t>();
|
||||||
auto scale_stride = scale.strides()[0];
|
auto scale_stride = scale.strides()[0];
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||||
auto scale = scale_data[i * scale_stride];
|
auto scale = scale_data[i * scale_stride];
|
||||||
for (int64_t j = 0; j < ddim; j++) {
|
for (const auto j : c10::irange(ddim)) {
|
||||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||||
}
|
}
|
||||||
} else if (bag_size.defined()) {
|
} else if (bag_size.defined()) {
|
||||||
@ -419,14 +419,14 @@ index_select_scale_add(const Tensor &select_indices,
|
|||||||
auto numel = add_indices.numel();
|
auto numel = add_indices.numel();
|
||||||
|
|
||||||
|
|
||||||
for (int64_t i = 0; i < numel; i++) {
|
for (const auto i : c10::irange(numel)) {
|
||||||
// We can skip indices equal to padding_idx so they are not included in
|
// We can skip indices equal to padding_idx so they are not included in
|
||||||
// the reduction
|
// the reduction
|
||||||
if (select_indices_data[i] != padding_idx) {
|
if (select_indices_data[i] != padding_idx) {
|
||||||
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
auto* src_base = src_data + src_stride0 * select_indices_data[i];
|
||||||
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
auto* output_base = output_data + output_stride0 * add_indices_data[i];
|
||||||
auto scale = scale_data[i * scale_stride];
|
auto scale = scale_data[i * scale_stride];
|
||||||
for (int64_t j = 0; j < ddim; j++) {
|
for (const auto j : c10::irange(ddim)) {
|
||||||
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
output_base[j * output_stride1] += src_base[j * src_stride1] * scale;
|
||||||
}
|
}
|
||||||
} else if (bag_size.defined()) {
|
} else if (bag_size.defined()) {
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <ATen/native/TensorIterator.h>
|
#include <ATen/native/TensorIterator.h>
|
||||||
#include <ATen/Utils.h>
|
#include <ATen/Utils.h>
|
||||||
#include <c10/util/accumulate.h>
|
#include <c10/util/accumulate.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
namespace native {
|
namespace native {
|
||||||
@ -63,7 +64,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
|||||||
|
|
||||||
if (nDims > 2) {
|
if (nDims > 2) {
|
||||||
int64_t dim1 = height;
|
int64_t dim1 = height;
|
||||||
for (int64_t i = 1; i < nDims; i++) {
|
for (const auto i : c10::irange(1, nDims)) {
|
||||||
if (self.size(i) != dim1) {
|
if (self.size(i) != dim1) {
|
||||||
AT_ERROR("all dimensions of input must be of equal length");
|
AT_ERROR("all dimensions of input must be of equal length");
|
||||||
}
|
}
|
||||||
@ -76,7 +77,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
|||||||
int64_t size = std::min(height, width);
|
int64_t size = std::min(height, width);
|
||||||
|
|
||||||
int64_t stride = 0;
|
int64_t stride = 0;
|
||||||
for (int64_t i = 0; i < nDims; i++) {
|
for (const auto i : c10::irange(nDims)) {
|
||||||
stride += self.stride(i);
|
stride += self.stride(i);
|
||||||
}
|
}
|
||||||
strides.push_back(stride);
|
strides.push_back(stride);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/NativeFunctions.h>
|
#include <ATen/NativeFunctions.h>
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -32,7 +33,7 @@ TORCH_META_FUNC(fractional_max_pool2d) (
|
|||||||
int64_t ndims = input.ndimension();
|
int64_t ndims = input.ndimension();
|
||||||
TORCH_CHECK(ndims == 3 || ndims == 4,
|
TORCH_CHECK(ndims == 3 || ndims == 4,
|
||||||
"fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes());
|
"fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes());
|
||||||
for (int64_t i = 1; i < ndims; ++i) {
|
for (const auto i : c10::irange(1, ndims)) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got",
|
"fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||||
input.sizes(), " with dimension ", i, " being empty.");
|
input.sizes(), " with dimension ", i, " being empty.");
|
||||||
@ -106,7 +107,7 @@ static void fractional_max_pool2d_out_single_batch_frame(
|
|||||||
int outputW, int outputH,
|
int outputW, int outputH,
|
||||||
int poolSizeW, int poolSizeH) {
|
int poolSizeW, int poolSizeH) {
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; ++plane) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
/* each plane contains 2 random samples, one for W and one for H */
|
/* each plane contains 2 random samples, one for W and one for H */
|
||||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
|
scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
|
||||||
|
|
||||||
@ -177,7 +178,7 @@ static void fractional_max_pool2d_out_frame(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool2d_out_single_batch_frame<scalar_t>(
|
fractional_max_pool2d_out_single_batch_frame<scalar_t>(
|
||||||
input + batch * numPlanes * inputH * inputW,
|
input + batch * numPlanes * inputH * inputW,
|
||||||
output + batch * numPlanes * outputH * outputW,
|
output + batch * numPlanes * outputH * outputW,
|
||||||
@ -254,7 +255,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame(
|
|||||||
int inputW, int inputH,
|
int inputW, int inputH,
|
||||||
int outputW, int outputH) {
|
int outputW, int outputH) {
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; plane++) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH;
|
scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH;
|
||||||
scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH;
|
scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH;
|
||||||
int64_t* indicesForPlane = indices + plane * outputW * outputH;
|
int64_t* indicesForPlane = indices + plane * outputW * outputH;
|
||||||
@ -291,7 +292,7 @@ static void fractional_max_pool2d_backward_out_frame(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool2d_backward_out_single_batch_frame<scalar_t>(
|
fractional_max_pool2d_backward_out_single_batch_frame<scalar_t>(
|
||||||
gradInput + batch * numPlanes * inputH * inputW,
|
gradInput + batch * numPlanes * inputH * inputW,
|
||||||
gradOutput + batch * numPlanes * outputH * outputW,
|
gradOutput + batch * numPlanes * outputH * outputW,
|
||||||
|
@ -44,7 +44,7 @@ static void fractional_max_pool3d_out_single_batch_frame(
|
|||||||
int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) {
|
int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) {
|
||||||
|
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; ++plane) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
/* each plane contains 3 random samples,
|
/* each plane contains 3 random samples,
|
||||||
one for T, one for W, and one for H */
|
one for T, one for W, and one for H */
|
||||||
scalar_t* randomSamplesForPlane = randomSamples + plane * 3;
|
scalar_t* randomSamplesForPlane = randomSamples + plane * 3;
|
||||||
@ -126,7 +126,7 @@ static void fractional_max_pool3d_out_frame(
|
|||||||
}
|
}
|
||||||
|
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool3d_out_single_batch_frame<scalar_t>(
|
fractional_max_pool3d_out_single_batch_frame<scalar_t>(
|
||||||
input + batch * numPlanes * inputW * inputH * inputT,
|
input + batch * numPlanes * inputW * inputH * inputT,
|
||||||
output + batch * numPlanes * outputW * outputH * outputT,
|
output + batch * numPlanes * outputW * outputH * outputT,
|
||||||
@ -171,7 +171,7 @@ void fractional_max_pool3d_out_cpu_template(
|
|||||||
TORCH_CHECK(ndims == 4 || ndims == 5,
|
TORCH_CHECK(ndims == 4 || ndims == 5,
|
||||||
"fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ",
|
"fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ",
|
||||||
input_.sizes());
|
input_.sizes());
|
||||||
for (int64_t i = 1; i < ndims; ++i) {
|
for (const auto i : c10::irange(1, ndims)) {
|
||||||
TORCH_CHECK(input_.size(i) > 0,
|
TORCH_CHECK(input_.size(i) > 0,
|
||||||
"fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got",
|
"fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got",
|
||||||
input_.sizes(), " with dimension ", i, " being empty.");
|
input_.sizes(), " with dimension ", i, " being empty.");
|
||||||
@ -243,7 +243,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame(
|
|||||||
int64_t outputT, int64_t outputH, int64_t outputW) {
|
int64_t outputT, int64_t outputH, int64_t outputW) {
|
||||||
|
|
||||||
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto plane = start; plane < end; plane++) {
|
for (const auto plane : c10::irange(start, end)) {
|
||||||
scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW;
|
scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW;
|
||||||
scalar_t* gradOutputForPlane = gradOutput +
|
scalar_t* gradOutputForPlane = gradOutput +
|
||||||
plane * outputT * outputH * outputW;
|
plane * outputT * outputH * outputW;
|
||||||
@ -284,7 +284,7 @@ static void fractional_max_pool3d_backward_out_frame(
|
|||||||
}
|
}
|
||||||
|
|
||||||
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) {
|
||||||
for (auto batch = start; batch < end; ++batch) {
|
for (const auto batch : c10::irange(start, end)) {
|
||||||
fractional_max_pool3d_backward_out_single_batch_frame<scalar_t>(
|
fractional_max_pool3d_backward_out_single_batch_frame<scalar_t>(
|
||||||
gradInput + batch * numPlanes * inputW * inputH * inputT,
|
gradInput + batch * numPlanes * inputW * inputH * inputT,
|
||||||
gradOutput + batch * numPlanes * outputW * outputH * outputT,
|
gradOutput + batch * numPlanes * outputW * outputH * outputT,
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <ATen/native/UpSample.h>
|
#include <ATen/native/UpSample.h>
|
||||||
#include <ATen/native/cpu/GridSamplerKernel.h>
|
#include <ATen/native/cpu/GridSamplerKernel.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace at { namespace native {
|
namespace at { namespace native {
|
||||||
|
|
||||||
@ -51,12 +52,12 @@ namespace {
|
|||||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
for (int64_t d = 0; d < out_D; ++d) {
|
for (const auto d : c10::irange(out_D)) {
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w) {
|
for (const auto w : c10::irange(out_W)) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||||
scalar_t ix = *grid_ptr_NDHW;
|
scalar_t ix = *grid_ptr_NDHW;
|
||||||
@ -222,12 +223,12 @@ namespace {
|
|||||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
|
scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
|
||||||
for (int64_t d = 0; d < out_D; ++d) {
|
for (const auto d : c10::irange(out_D)) {
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||||
@ -416,11 +417,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
|||||||
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
scalar_t *grid_ptr = grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w) {
|
for (const auto w : c10::irange(out_W)) {
|
||||||
// get the corresponding input x, y, z co-ordinates from grid
|
// get the corresponding input x, y, z co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||||
scalar_t x = *grid_ptr_NHW;
|
scalar_t x = *grid_ptr_NHW;
|
||||||
@ -505,7 +506,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid,
|
|||||||
scalar_t coefficients[4];
|
scalar_t coefficients[4];
|
||||||
|
|
||||||
// Interpolate 4 values in the x directon
|
// Interpolate 4 values in the x directon
|
||||||
for (int64_t i = 0; i < 4; ++i) {
|
for (const auto i : c10::irange(4)) {
|
||||||
coefficients[i] = cubic_interp1d<scalar_t>(
|
coefficients[i] = cubic_interp1d<scalar_t>(
|
||||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||||
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
get_value_bounded<scalar_t>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
|
||||||
@ -578,11 +579,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
|||||||
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
scalar_t *gGrid_ptr = grad_grid.data_ptr<scalar_t>();
|
||||||
// loop over each output pixel
|
// loop over each output pixel
|
||||||
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
|
||||||
for (int64_t n = start; n < end; ++n) {
|
for (const auto n : c10::irange(start, end)) {
|
||||||
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
|
||||||
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
|
||||||
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
|
scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
|
||||||
for (int64_t h = 0; h < out_H; ++h) {
|
for (const auto h : c10::irange(out_H)) {
|
||||||
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
|
||||||
// get the corresponding input x, y co-ordinates from grid
|
// get the corresponding input x, y co-ordinates from grid
|
||||||
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
|
||||||
@ -703,8 +704,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output,
|
|||||||
for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) {
|
for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) {
|
||||||
scalar_t gOut = *gOut_ptr_NCHW;
|
scalar_t gOut = *gOut_ptr_NCHW;
|
||||||
|
|
||||||
for (int64_t i = 0; i < 4; ++i) {
|
for (const auto i : c10::irange(4)) {
|
||||||
for (int64_t j = 0; j < 4; ++j) {
|
for (const auto j : c10::irange(4)) {
|
||||||
|
|
||||||
// set input gradient
|
// set input gradient
|
||||||
add_value_bounded<scalar_t>(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j,
|
add_value_bounded<scalar_t>(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j,
|
||||||
@ -857,7 +858,7 @@ Tensor grid_sampler(const Tensor& input, const Tensor& grid,
|
|||||||
!(input.dim() == 5 && static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Bicubic),
|
!(input.dim() == 5 && static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Bicubic),
|
||||||
"grid_sampler(): bicubic interpolation only supports 4D input"
|
"grid_sampler(): bicubic interpolation only supports 4D input"
|
||||||
);
|
);
|
||||||
for (int64_t i = 2; i < input.dim(); i++) {
|
for (const auto i : c10::irange(2, input.dim())) {
|
||||||
TORCH_CHECK(input.size(i) > 0,
|
TORCH_CHECK(input.size(i) > 0,
|
||||||
"grid_sampler(): expected input to have non-empty spatial dimensions, "
|
"grid_sampler(): expected input to have non-empty spatial dimensions, "
|
||||||
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
|
||||||
@@ -5,6 +5,7 @@

#include <ATen/native/im2col.h>
#include <ATen/native/im2col_shape_check.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -91,7 +92,7 @@ static void im2col_out_cpu_template(
Tensor input_n;
Tensor output_n;

-for (int64_t elt = 0; elt < batch_size; elt++) {
+for (const auto elt : c10::irange(batch_size)) {
input_n = input.select(0, elt);
output_n = output.select(0, elt);

@@ -2,6 +2,7 @@
#include <ATen/ExpandUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/core/List.h>
+#include <c10/util/irange.h>

#include <limits>

@@ -31,7 +32,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
}
// The sizes of the ByteTensor mask or bool tensor must match the sizes of the
// corresponding dimensions in self
-for (int64_t j = 0; j < index.dim(); j++) {
+for (const auto j : c10::irange(index.dim())) {
int64_t srcIdx = result.size() + j;
if (index.size(j) != self.size(srcIdx)) {
invalid_mask(self, srcIdx, index, j);

@@ -39,7 +40,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, const t
}
// Replace with nonzeros
auto nonzero = index.nonzero();
-for (int64_t j = 0; j < index.dim(); j++) {
+for (const auto j : c10::irange(index.dim())) {
result.emplace_back(nonzero.select(1, j));
}
} else {
@@ -1158,7 +1158,7 @@ static void addbmm_impl_(
}

auto adjusted_beta(beta);
-for (int64_t batch = 0; batch < num_batches; ++batch) {
+for (const auto batch : c10::irange(num_batches)) {
result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha);
adjusted_beta = 1; // accumulate output once
}

@@ -1215,23 +1215,23 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T

int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1);
parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) {
-for (int64_t b = b_begin; b < b_end; b++) {
+for (const auto b : c10::irange(b_begin, b_end)) {
auto r1 = r0[b];
auto s1 = s0[b];
auto m1 = m0[b];
-for (int64_t i = 0; i < is; i++) {
+for (const auto i : c10::irange(is)) {
auto r2 = r1[i];
auto s2 = s1[i];
-for (int64_t j = 0; j < js; j++) {
+for (const auto j : c10::irange(js)) {
scalar_t &r = r2[j];
if (is_bmm) {
r = 0;
-for (int64_t k = 0; k < ks; k++) {
+for (const auto k : c10::irange(ks)) {
r += s2[k] * m1[k][j];
}
} else {
r *= beta;
-for (int64_t k = 0; k < ks; k++) {
+for (const auto k : c10::irange(ks)) {
r += alpha * s2[k] * m1[k][j];
}
}

@@ -1994,10 +1994,11 @@ void compute_T18_scale_square(
auto mexp_scaled = at::native::compute_T18<scalar_t>(a_scaled);
auto s_cpu = (s.device().type() == at::kCPU)
? s : s.to(at::kCPU);
-for (int64_t i = 0; i < mexp_scaled.size(0); ++i) {
+for (const auto i : c10::irange(mexp_scaled.size(0))) {
auto s_val = s_cpu.select(0, i).template item<int64_t>();
auto mexp = mexp_scaled.select(0, i);
-for (int64_t p = 0; p < s_val; ++p) {
+for (const auto p : c10::irange(s_val)) {
+(void)p; //Suppress unused variable warning
mexp = at::matmul(mexp, mexp);
}
mexp_out.select(0, i).copy_(mexp);
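
The compute_T18_scale_square hunk above also inserts "(void)p; //Suppress unused variable warning" right after the converted loop header. In the old counted loop the index appears in the loop condition, so it is never "unused"; after the conversion the body only repeats mexp = at::matmul(mexp, mexp) without reading p, and some compilers warn. Casting the variable to void is the conventional way to mark it as intentionally unused. A minimal sketch of the same idiom, using a std::vector in place of c10::irange and scalar squaring in place of at::matmul (the repeat count here is hypothetical):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  double x = 2.0;

  // 'steps' plays the role of c10::irange(s_val): a sequence whose values
  // the loop body never reads.
  std::vector<int64_t> steps(3);
  std::iota(steps.begin(), steps.end(), 0);

  for (const auto p : steps) {
    (void)p;    // intentionally unused; without this some compilers warn
    x = x * x;  // analogous to mexp = at::matmul(mexp, mexp)
  }
  std::cout << x << '\n';  // 2 squared three times: prints 256
  return 0;
}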
@@ -2265,7 +2266,7 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens
// (e.g. [0, 1, 2, ..., ndim-1])
static std::vector<int64_t> make_dim_list(int64_t ndim) {
std::vector<int64_t> dim_list(ndim);
-for (int64_t ind = 0; ind < ndim; ind++) {
+for (const auto ind : c10::irange(ndim)) {
dim_list[ind] = ind;
}
return dim_list;

@@ -2818,7 +2819,7 @@ struct KronImpl final {
a_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
b_reshape = c10::SmallVector<int64_t, 10>(2 * maxdim);
result_reshape = c10::SmallVector<int64_t, 10>(maxdim);
-for (int64_t i = 0; i < maxdim; i++) {
+for (const auto i : c10::irange(maxdim)) {
a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1);
a_reshape[2 * i + 1] = 1;
b_reshape[2 * i] = 1;

@@ -2833,7 +2834,7 @@ struct KronImpl final {
TORCH_INTERNAL_ASSERT(result.defined(), "Cannot call kron_out with an undefined result tensor as the out argument. Please allocate a Tensor before calling kron_out with it.");

c10::SmallVector<int64_t, 10> mul_shape(2 * maxdim);
-for (int64_t i = 0; i < maxdim; i++) {
+for (const auto i : c10::irange(maxdim)) {
mul_shape[2 * i] = a_reshape[2 * i];
mul_shape[2 * i + 1] = b_reshape[2 * i + 1];
}
@@ -1,6 +1,7 @@
#pragma once

#include <c10/core/ScalarType.h>
+#include <c10/util/irange.h>
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/TensorUtils.h>

@@ -169,7 +170,8 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
auto* b_batch_idx_ptr = data[0];
auto* a_batch_idx_ptr = data[1];

-for (int64_t elem = 0; elem < nelems; ++elem) {
+for (const auto elem : c10::irange(nelems)) {
+(void)elem; //Suppress unused variable warning
auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);

@@ -332,7 +334,7 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
const int64_t ndim = self.ndimension();
std::vector<int64_t> perm;

-for (int64_t i = 0; i < ndim; i++) {
+for (const auto i : c10::irange(ndim)) {
auto it = std::find(a.begin(), a.end(), i);
if (it == a.end()) {
perm.push_back(i);

@@ -476,7 +478,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
"duplicate or invalid dimensions");
std::vector<int64_t> permutation(ndim);
int64_t cur_permuted_dim = 0;
-for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
+for (const auto dim_ind : c10::irange(ndim)) {
if ((dim_ind != dim0) && (dim_ind != dim1)) {
permutation[cur_permuted_dim++] = dim_ind;
}

@@ -493,7 +495,7 @@ static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0
static inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
int64_t ndim = permutation.size();
std::vector<int64_t> reverse_permutation(ndim);
-for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) {
+for (const auto dim_ind : c10::irange(ndim)) {
reverse_permutation[permutation[dim_ind]] = dim_ind;
}
return reverse_permutation;
@@ -11,6 +11,7 @@
#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <ATen/native/Fill.h>
+#include <c10/util/irange.h>

#include <numeric>
#include <type_traits>

@@ -60,7 +61,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
std::vector<int64_t> tg_batch_offsets(batch_size);
if (targets.dim() == 1) { // concatenated targets
int64_t pos = 0;
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])

@@ -72,7 +73,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
else { // batch x max_target_length
// dim is 2
int64_t tg_batch_stride = targets.stride(0);
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = i * tg_batch_stride;
if (max_target_length < target_lengths[i])
max_target_length = target_lengths[i];

@@ -84,7 +85,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
" (while checking arguments for ", c, ")");
}
int64_t max_input_length = log_probs.size(0);
-for (int64_t b = 0; b < batch_size; b++) {
+for (const auto b : c10::irange(batch_size)) {
TORCH_CHECK(input_lengths[b] <= max_input_length,
"Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b],
" (while checking arguments for ", c, ")");

@@ -103,7 +104,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
// first the default
log_alpha.narrow(1, 0, 1).fill_(neginf);
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
int64_t input_length = input_lengths[b];
int64_t target_length = target_lengths[b];
auto log_probs_a = log_probs_a_global[b];

@@ -116,7 +117,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)];

// now the loop over the inputs
-for (int64_t t=1; t<input_length; t++) {
+for (const auto t : c10::irange(1, input_length)) {
for (int64_t s=0; s<2*target_length+1; s++) {
auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
// this loop over s could be parallel/vectorized, too, but the required items are one index apart
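
Several of the converted loops, including the CTC-loss one directly above, sit inside an at::parallel_for callback that hands each worker a half-open [start, end) chunk; those call sites use the two-argument c10::irange(start, end) form. The sketch below shows the shape of that pattern with simplified stand-ins for both helpers (a serial parallel_for and a vector-backed irange); it is not the ATen implementation, and the per-batch work is a placeholder.

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

namespace demo {

// Stand-in for at::parallel_for: runs the whole range as a single serial chunk.
void parallel_for(int64_t begin, int64_t end, int64_t /*grain_size*/,
                  const std::function<void(int64_t, int64_t)>& body) {
  body(begin, end);
}

// Stand-in for the two-argument c10::irange(begin, end).
std::vector<int64_t> irange(int64_t begin, int64_t end) {
  std::vector<int64_t> v(end > begin ? end - begin : 0);
  std::iota(v.begin(), v.end(), begin);
  return v;
}

}  // namespace demo

int main() {
  const int64_t batch_size = 5;
  std::vector<double> losses(batch_size, 0.0);

  demo::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
    // Old style: for (int64_t b = start; b < end; b++)
    for (const auto b : demo::irange(start, end)) {
      losses[b] = 0.5 * static_cast<double>(b);  // placeholder per-batch work
    }
  });

  for (const auto v : losses) std::cout << v << ' ';  // 0 0.5 1 1.5 2
  std::cout << '\n';
  return 0;
}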
@@ -189,7 +190,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
if (targets.dim() == 1) { // concatenated targets
int64_t pos = 0;
max_target_length = 0;
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])

@@ -200,7 +201,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
else { // batch x max_target_length
// dim is 2
int64_t tg_batch_stride = targets.stride(0);
-for (int64_t i = 0; i < batch_size; i++) {
+for (const auto i : c10::irange(batch_size)) {
tg_batch_offsets[i] = i * tg_batch_stride;
}
tg_target_stride = targets.stride(1);

@@ -234,7 +235,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
TensorIterator fill_1d_iter_local(fill_1d_iter);
TensorIterator fill_log_beta_1d_iter_local(fill_log_beta_1d_iter);

-for (int64_t b = start; b < end; b++) {
+for (const auto b : c10::irange(start, end)) {
scalar_t nll = neg_log_likelihood.accessor<scalar_t, 1>()[b];
auto grad_a = grad_a_global[b];
if (zero_infinity && nll == std::numeric_limits<scalar_t>::infinity()) {

@@ -322,8 +323,8 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
// this could be a great target for further vectorization.
// grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16)
scalar_t gr = grad_out.accessor<scalar_t, 1>()[b];
-for (int64_t t = 0; t < input_length; t++) { // or go for the full thing?
-for (int64_t c = 0; c < num_labels; c++) {
+for (const auto t : c10::irange(input_length)) { // or go for the full thing?
+for (const auto c : c10::irange(num_labels)) {
scalar_t& res = grad_a[t][c];
scalar_t lp = log_probs_a[t][c];
res = (std::exp(lp)-std::exp(res + nll - lp)) * gr;
@@ -3,6 +3,7 @@
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/native/LossMulti.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -17,21 +18,21 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu(
int64_t dim) {
using accscalar_t = at::acc_type<scalar_t, false>;
accscalar_t sum = 0;
-for (int64_t ddt = 0; ddt < dim; ddt++) {
+for (const auto ddt : c10::irange(dim)) {
int64_t target_idx = target_data[ddt];
if (target_idx < 0) {
break;
}
is_target_data[target_idx] = 1;
}
-for (int64_t dt = 0; dt < dim; dt++) {
+for (const auto dt : c10::irange(dim)) {
int64_t target_idx = target_data[dt];
if (target_idx < 0) {
break;
}

scalar_t input_target = input_data[target_idx];
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (!is_target_data[d]) {
scalar_t z = 1 - input_target + input_data[d];
if (z > 0) {
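
For context on what the converted loops in multilabel_margin_loss_forward_inner_sum_cpu compute: the first pass marks which classes are targets (the target list is terminated by a negative index), and the second pass accumulates the hinge terms max(0, 1 - x[target] + x[d]) over all non-target classes d. The sketch below reproduces that computation in isolation; it is a simplified reading of the code shown in the hunk (the accumulation inside "if (z > 0)" is assumed to be sum += z, and the final scaling by dim follows the documented MultiLabelMarginLoss definition), not a drop-in replacement for the ATen kernel.

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified single-sample version of the inner sum: `input` holds scores for
// dim classes, `target` holds class indices terminated by a negative value.
double multilabel_margin_inner_sum_demo(const std::vector<double>& input,
                                        const std::vector<int64_t>& target) {
  const int64_t dim = static_cast<int64_t>(input.size());
  std::vector<char> is_target(dim, 0);

  // Pass 1: mark the target classes.
  for (int64_t ddt = 0; ddt < dim; ++ddt) {
    const int64_t target_idx = target[ddt];
    if (target_idx < 0) break;
    is_target[target_idx] = 1;
  }

  // Pass 2: hinge terms between each target class and every non-target class.
  double sum = 0;
  for (int64_t dt = 0; dt < dim; ++dt) {
    const int64_t target_idx = target[dt];
    if (target_idx < 0) break;
    const double input_target = input[target_idx];
    for (int64_t d = 0; d < dim; ++d) {
      if (!is_target[d]) {
        const double z = 1 - input_target + input[d];
        if (z > 0) sum += z;  // assumed accumulation; the hunk cuts off here
      }
    }
  }
  return sum / static_cast<double>(dim);  // per the MultiLabelMarginLoss docs
}

int main() {
  // Two target classes (0 and 2) out of four; -1 terminates the target list.
  const std::vector<double> input = {0.9, 0.1, 0.8, 0.2};
  const std::vector<int64_t> target = {0, 2, -1, -1};
  std::cout << multilabel_margin_inner_sum_demo(input, target) << '\n';  // 0.3
  return 0;
}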
@@ -63,7 +64,8 @@ static void multilabel_margin_loss_forward_out_frame(

accscalar_t sum = 0;

-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
+(void)t; //Suppress unused variable warning
sum += multilabel_margin_loss_forward_inner_sum_cpu(
input_data, target_data, is_target_data, dim);

@@ -81,7 +83,7 @@ static void multilabel_margin_loss_forward_out_frame(
} else {
auto output_acc = output.accessor<scalar_t, 1>();

-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
scalar_t sum = multilabel_margin_loss_forward_inner_sum_cpu(
input_data, target_data, is_target_data, dim);

@@ -171,15 +173,16 @@ static void multilabel_margin_loss_backward_out_frame(
reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim);

scalar_t* grad_input_row_data = grad_input.data_ptr<scalar_t>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t dt = 0; dt < dim; dt++) {
+for (const auto t : c10::irange(nframe)) {
+(void)t; //Suppress unused variable warning
+for (const auto dt : c10::irange(dim)) {
int64_t target_idx = target_data[dt];
if (target_idx < 0) {
break;
}

scalar_t input_target = input_data[target_idx];
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (!is_target_data[d]) {
scalar_t z = 1 - input_target + input_data[d];
if (z > 0) {

@@ -206,8 +209,8 @@ static void multilabel_margin_loss_backward_out_frame(
} else {
check_dim_size(grad_output, 1, 0, nframe);
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t d = 0; d < dim; d++) {
+for (const auto t : c10::irange(nframe)) {
+for (const auto d : c10::irange(dim)) {
grad_input_data[t * dim + d] *= grad_output_acc[t];
}
}
@@ -2,6 +2,7 @@
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/native/LossMulti.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -18,7 +19,7 @@ inline scalar_t multi_margin_inner_sum_cpu(
const int64_t target_idx) {
const scalar_t input_target = input_data[target_idx];
scalar_t sum = 0;
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
if (d == target_idx) {
continue;
}

@@ -63,7 +64,7 @@ static inline void multi_margin_loss_cpu_kernel(
// cannot be handled by TensorAccessor)
if (reduction == Reduction::None && output.dim() > 0) {
auto output_acc = output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
const auto idx = target_index_checked(target_data, t, dim);
auto sum = multi_margin_inner_sum_cpu(
input_data, weight_data, p, margin, dim, idx);

@@ -73,7 +74,7 @@ static inline void multi_margin_loss_cpu_kernel(
} else {
accscalar_t sum = 0;
auto output_acc = output.data_ptr<scalar_t>();
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
const auto idx = target_index_checked(target_data, t, dim);
sum += multi_margin_inner_sum_cpu(
input_data, weight_data, p, margin, dim, idx);

@@ -149,11 +150,11 @@ static void multi_margin_loss_backward_cpu_kernel(
int64_t dim,
int64_t reduction) {
scalar_t* grad_input_row_data = grad_input_data;
-for (int64_t t = 0; t < nframe; t++) {
+for (const auto t : c10::irange(nframe)) {
int64_t target_idx = target_index_checked(target_data, t, dim);
scalar_t input_target = input_data[target_idx];
scalar_t grad_input_target = 0;
-for (int64_t d = 0; d < dim; d++) {
+for (const auto d : c10::irange(dim)) {
scalar_t z = margin - input_target + input_data[d];
if (d == target_idx) {
continue;

@@ -186,8 +187,8 @@ static void multi_margin_loss_backward_cpu_kernel(
}
} else {
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
-for (int64_t t = 0; t < nframe; t++) {
-for (int64_t d = 0; d < dim; d++) {
+for (const auto t : c10::irange(nframe)) {
+for (const auto d : c10::irange(dim)) {
grad_input_data[t * dim + d] *= grad_output_acc[t];
}
}
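
The last hunk above (and the matching one in the multilabel-margin kernel earlier) ends by scaling every element of a row-major [nframe, dim] gradient buffer by that frame's incoming gradient, grad_input_data[t * dim + d] *= grad_output_acc[t]. A small self-contained sketch of that flattened-index pattern, with hypothetical sizes and data:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t nframe = 2;  // hypothetical sizes for illustration
  const int64_t dim = 3;

  // Row-major [nframe, dim] gradient buffer, initially all ones.
  std::vector<double> grad_input(nframe * dim, 1.0);
  // One incoming gradient value per frame (what grad_output_acc[t] supplies).
  const std::vector<double> grad_output = {0.5, 2.0};

  for (int64_t t = 0; t < nframe; ++t) {
    for (int64_t d = 0; d < dim; ++d) {
      grad_input[t * dim + d] *= grad_output[t];  // same indexing as the hunk
    }
  }

  for (const auto v : grad_input) std::cout << v << ' ';  // 0.5 0.5 0.5 2 2 2
  std::cout << '\n';
  return 0;
}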
@@ -9,6 +9,7 @@
#include <c10/util/SmallBuffer.h>

#include <c10/core/TensorOptions.h>
+#include <c10/util/irange.h>

namespace at {
namespace meta {

@@ -155,7 +156,7 @@ static void nll_loss_out_frame(
auto output_acc = output.accessor<scalar_t, 1>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
const auto cur_target = target_acc[i];

if (cur_target == ignore_index) {

@@ -215,7 +216,7 @@ static void nll_loss_out_frame(
scalar_t weight_partial_sums[cascade_sum_num_levels] = {0};
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
scalar_t loss_partial_sums[cascade_sum_num_levels] = {0};
-for (int64_t b = 0; b < batch_size; b++) {
+for (const auto b : c10::irange(batch_size)) {
const int64_t cur_target = target_data[b];
if (cur_target == ignore_index) {
++num_ignored;

@@ -330,7 +331,7 @@ static void nll_loss_backward_out_frame(
auto grad_input_acc = grad_input.accessor<scalar_t, 2>();
auto grad_output_acc = grad_output.accessor<scalar_t, 1>();
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (auto i = start; i < end; i++) {
+for (const auto i : c10::irange(start, end)) {
auto cur_target = target_acc[i];
if (cur_target == ignore_index) {
continue;
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/cpu/utils.h>
#include <ATen/native/Resize.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -109,9 +110,9 @@ static void nll_loss2d_forward_out_frame(
auto target_acc = target.accessor<int64_t, 3>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t h = 0; h < H; h++) {
-for (int64_t w = 0; w < W; w++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto h : c10::irange(H)) {
+for (const auto w : c10::irange(W)) {
const int64_t cur_target = (int64_t)target_acc[b][h][w];

if (cur_target == ignore_index) {

@@ -176,8 +177,8 @@ static void nll_loss2d_forward_out_frame(
const int64_t level_mask = level_step - 1;

int64_t num_ignored = 0;
-for (int64_t b = 0; b < batch_size; b++) {
-for (int64_t elem = 0; elem < map_size; elem++) {
+for (const auto b : c10::irange(batch_size)) {
+for (const auto elem : c10::irange(map_size)) {
const int64_t cur_target = target_data[b * map_size + elem];
if (cur_target == ignore_index) {
++num_ignored;

@@ -286,9 +287,9 @@ static void nll_loss2d_backward_out_frame(
auto target_acc = target.accessor<int64_t, 3>();

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t h = 0; h < H; h++) {
-for (int64_t w = 0; w < W; w++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto h : c10::irange(H)) {
+for (const auto w : c10::irange(W)) {
const int64_t cur_target = target_acc[b][h][w];
if (cur_target == ignore_index) {
continue;

@@ -329,8 +330,8 @@ static void nll_loss2d_backward_out_frame(
: grad_output_value);

at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
-for (int64_t b = start; b < end; b++) {
-for (int64_t elem = 0; elem < map_size; elem++) {
+for (const auto b : c10::irange(start, end)) {
+for (const auto elem : c10::irange(map_size)) {
const int64_t t = target_data[b * map_size + elem];

if (t != ignore_index) {
@@ -60,6 +60,7 @@ bool _nnpack_available() {
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
+#include <c10/util/irange.h>

namespace at {
namespace native {

@@ -238,7 +239,7 @@ Tensor _nnpack_spatial_convolution(
const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;

-for (size_t batch = 0u; batch < batch_size; ++batch) {
+for (const auto batch : c10::irange(0u, batch_size)) {
const nnp_status status = nnp_convolution_inference(
algorithm,
nnp_convolution_transform_strategy_compute,
@@ -100,7 +100,7 @@ Tensor refine_names(const Tensor& self, DimnameList names) {
self_names.size(), " and ", names.size(), " respectively).");
check_names_valid_for(self, names);

-for (size_t idx = 0; idx < self_names.size(); idx++) {
+for (const auto idx : c10::irange(self_names.size())) {
const auto& self_name = self_names[idx];
const auto& out_name = names[idx];
if (self_name == out_name || self_name.isWildcard()) {

@@ -221,7 +221,7 @@ Tensor align_to(const Tensor& tensor, DimnameList order, int64_t ellipsis_idx) {
};

// Fill in the non-ellipsis dimensions
-for (auto order_idx = 0U; order_idx < order.size(); ++order_idx) {
+for (const auto order_idx : c10::irange(0U, order.size())) {
auto out_idx = order_idx;
if (order_idx >= ellipsis_idx) {
out_idx = order_idx + num_ellipsis_names;
@@ -10,6 +10,7 @@
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/batch_norm.h>
#include <ATen/native/Normalization.h>
+#include <c10/util/irange.h>

#include <vector>

@@ -156,7 +157,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
// Reduce all dimensions except dim=1
DimVector reduce_dims(ndim - 1);
reduce_dims[0] = 0;
-for (int64_t i = 2; i < ndim; ++i) {
+for (const auto i : c10::irange(2, ndim)) {
reduce_dims[i - 1] = i;
}

@@ -178,7 +179,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);

parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
save_mean_a[f] = _mean_a[f];
save_var_transform_a[f] = VarTransform<accscalar_t>{}(_var_sum_a[f] / n, eps);

@@ -206,7 +207,7 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(

parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) {
TensorIterator iter(reduce_iter);
-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
// compute variance per input
iter.unsafe_replace_operand(0, in_data + channel_stride * f);
accscalar_t var_sum = 0;

@@ -283,7 +284,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
// Reduce all dimensions except dim=1
DimVector reduce_dims(ndim - 1);
reduce_dims[0] = 0;
-for (int64_t i = 2; i < ndim; ++i) {
+for (const auto i : c10::irange(2, ndim)) {
reduce_dims[i - 1] = i;
}

@@ -330,7 +331,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
TensorIterator unary_iter_local(unary_iter);
TensorIterator binary_iter_local(binary_iter);

-for (int64_t f = b_begin; f < b_end; ++f) {
+for (const auto f : c10::irange(b_begin, b_end)) {
scalar_t w = weight.defined() ? weight_a[f] : 1;

scalar_t mean, invstd;
@@ -77,7 +77,7 @@ std::tuple<Tensor, Tensor> _pack_padded_sequence(const Tensor& _input, const Ten
// more elements below in our column, we lower the counter (prev_l), and append the new
// block to the output.
int64_t prev_l = 0;
-for (int64_t i = 0; i < batch_size; ++i) {
+for (const auto i : c10::irange(batch_size)) {
int64_t l = lengths[batch_size - 1 - i];
if (l > prev_l) {
auto current_batch_size = batch_size - i;

@@ -109,7 +109,7 @@ Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_
int64_t offset = 0;
int64_t max_seq_len = batch_sizes_t.size(0);
int64_t * batch_sizes = batch_sizes_t.data_ptr<int64_t>();
-for (int64_t i = 0; i < max_seq_len; ++i) {
+for (const auto i : c10::irange(max_seq_len)) {
grad_input[i].slice(0, 0, batch_sizes[i]).copy_(grad.slice(0, offset, offset + batch_sizes[i]));
offset += batch_sizes[i];
}

@@ -170,7 +170,8 @@ std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor
}
int64_t dec = prev_batch_size - batch_size;
if (dec > 0) {
-for (int64_t j = 0; j < dec; ++j) {
+for (const auto j : c10::irange(dec)) {
+(void)j; //Suppress unused variable warning
(*lengths--) = i;
}
}

@@ -206,7 +207,7 @@ Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value
out_dims.insert(out_dims.end(), trailing_dims.begin(), trailing_dims.end());

Tensor out = at::full(out_dims, padding_value, sequences[0].options());
-for (int64_t i = 0; i < sequences_size; i++) {
+for (const auto i : c10::irange(sequences_size)) {
const Tensor currseq = sequences[i];
const int64_t length_i = currseq.size(0);
// use index notation to prevent duplicate references to the tensor
@@ -2,6 +2,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <ATen/native/DispatchStub.h>
+#include <c10/util/irange.h>

#pragma once

@@ -212,7 +213,7 @@ pool3d_shape_check(
TORCH_CHECK(ndim == 4 || ndim == 5,
fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes());

-for (int64_t i = 1; i < ndim; ++i) {
+for (const auto i : c10::irange(1, ndim)) {
TORCH_CHECK(input.size(i) > 0,
fn_name, "Expected input to have non-zero size for non-batch dimensions, but got",
input.sizes(), " with dimension ", i, " being empty.");
@@ -206,9 +206,9 @@ void CalcColOffsetsTranspose(
const int8_t* Bint8,
int32_t B_zero_point,
int32_t* col_offsets) {
-for (int i = 0; i < N; ++i) {
+for (const auto i : c10::irange(N)) {
int32_t sum = 0;
-for (int j = 0; j < K; ++j) {
+for (const auto j : c10::irange(K)) {
sum += Bint8[i * K + j];
}
col_offsets[i] = sum - B_zero_point * K;

@@ -353,7 +353,7 @@ bool CheckAndSaturate(T max_val, T* element) {
void HandleWeightsSaturation(int64_t N, float* weight) {
const float kFp16Max = RawUint16ToFp16(0x7BFF);
bool found_out_of_range = false;
-for (int64_t i = 0; i < N; ++i) {
+for (const auto i : c10::irange(N)) {
if (CheckAndSaturate<float>(kFp16Max, weight + i)) {
found_out_of_range = true;
}
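
The CalcColOffsetsTranspose hunk above converts the loop bounds but leaves the arithmetic untouched: for an int8 matrix stored row-major as N rows of K values, each output entry is the row sum minus B_zero_point * K. A small standalone rendering of that computation (plain loops, hypothetical data), just to make the quantization bookkeeping concrete:

#include <cstdint>
#include <iostream>
#include <vector>

// Demo version of the column-offset computation shown in the hunk above.
void CalcColOffsetsTransposeDemo(int64_t N, int64_t K, const int8_t* Bint8,
                                 int32_t B_zero_point, int32_t* col_offsets) {
  for (int64_t i = 0; i < N; ++i) {
    int32_t sum = 0;
    for (int64_t j = 0; j < K; ++j) {
      sum += Bint8[i * K + j];  // row-major row sum
    }
    col_offsets[i] = sum - B_zero_point * K;
  }
}

int main() {
  const int64_t N = 2, K = 3;
  const std::vector<int8_t> B = {1, 2, 3, 4, 5, 6};
  std::vector<int32_t> offsets(N);
  CalcColOffsetsTransposeDemo(N, K, B.data(), /*B_zero_point=*/1, offsets.data());
  for (const auto o : offsets) std::cout << o << ' ';  // prints 3 12
  std::cout << '\n';
  return 0;
}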
Some files were not shown because too many files have changed in this diff.