#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/ReduceOps.h>
#include <ATen/native/ReduceAllOps.h>
#include <ATen/native/ReduceOpsUtils.h>

#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
#include <ATen/OpMathType.h>

#include <ATen/native/cpu/Loops.h>
#include <ATen/native/cpu/zmath.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/irange.h>

namespace at::native {
namespace {

using namespace vec;
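
// Vectorized full reduction: at::parallel_reduce splits [0, input.numel())
// into chunks, each chunk is reduced with vec::reduce_all using the
// vectorized op `vop`, and the per-chunk partial results are combined with
// the scalar op `op`. The final value is written into `output` via fill_().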
template <typename scalar_t, typename func_t, typename vec_func_t>
inline void reduce_all_impl_vec(
    Tensor& output,
    const Tensor& input,
    const scalar_t ident_v,
    func_t op,
    vec_func_t vop) {
  using Vec = Vectorized<opmath_type<scalar_t>>;
  const int64_t input_numel = input.numel();
  auto input_data = input.const_data_ptr<scalar_t>();
  // NOTE: parallel_reduce does not support the bool type
  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
    [&](int64_t start, int64_t end, const scalar_t /*ident*/) -> scalar_t {
      scalar_t partial_out = vec::reduce_all<scalar_t>(
        [=](Vec x, Vec y) { return vop(x, y); },
        input_data + start,
        end - start);
      return partial_out;
    }, op);
  output.fill_(result);
}

// For operations not supported in avx/avx2: scalar fallback that folds the
// input element by element.
template <typename scalar_t, typename func_t>
inline void reduce_all_impl(
    Tensor& output,
    const Tensor& input,
    const scalar_t ident_v,
    func_t op) {
  const int64_t input_numel = input.numel();
  auto input_data = input.const_data_ptr<scalar_t>();
  scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
    [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t {
      scalar_t partial_out = ident;
      for (const auto i : c10::irange(start, end)) {
        partial_out = op(partial_out, input_data[i]);
      }
      return partial_out;
    }, op);
  output.fill_(result);
}
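
// Minimum over all elements. Three paths:
//   - bool: serial TensorIterator kernel (parallel_reduce does not support bool),
//   - int64_t: scalar reduce_all_impl (the vectorized path is slower for int64_t),
//   - all other dispatched types: vectorized reduce_all_impl_vec with minimum().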
void min_all_kernel_impl(Tensor& result, const Tensor& input) {
  if (input.scalar_type() == ScalarType::Bool) {
    TensorIterator iter = TensorIteratorConfig()
      .add_input(input)
      .build();
    bool result_data = true;
    cpu_serial_kernel(iter, [&](const bool a) -> void {
      result_data = result_data && a;
    });
    result.fill_(result_data);
  } else if (input.scalar_type() == ScalarType::Long) {
    // for int64_t, the vectorized implementation has performance issues,
    // so just use the scalar path
    reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
  } else {
    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
        [=](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); });
    });
  }
}
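
// Maximum over all elements; mirrors min_all_kernel_impl with the identity
// and combining ops flipped (lower_bound / max_impl / maximum).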
void max_all_kernel_impl(Tensor& result, const Tensor& input) {
  if (input.scalar_type() == ScalarType::Bool) {
    TensorIterator iter = TensorIteratorConfig()
      .add_input(input)
      .build();
    bool result_data = false;
    cpu_serial_kernel(iter, [&](const bool a) -> void {
      result_data = result_data || a;
    });
    result.fill_(result_data);
  } else if (input.scalar_type() == ScalarType::Long) {
    // for int64_t, the vectorized implementation has performance issues,
    // so just use the scalar path
    reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
      [=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
  } else {
    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
        [=](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); });
    });
  }
}

// For operations not supported in avx/avx2: scalar fallback that produces two
// outputs in a single pass (e.g. min and max).
template <typename scalar_t, typename func_t1, typename func_t2>
inline void reduce_all_impl_two_outputs(
    Tensor& output1,
    Tensor& output2,
    const Tensor& input,
    const std::pair<scalar_t, scalar_t>& ident_v,
    func_t1 reduce_chunk_func,
    func_t2 reduce_acc_func) {
  using scalar_t_pair = std::pair<scalar_t, scalar_t>;
  const int64_t input_numel = input.numel();
  auto input_data = input.const_data_ptr<scalar_t>();
  scalar_t_pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
    [&](int64_t start, int64_t end, const scalar_t_pair& ident) -> scalar_t_pair {
      scalar_t_pair partial_out(ident);
      for (const auto i : c10::irange(start, end)) {
        partial_out = reduce_chunk_func(partial_out, input_data[i]);
      }
      return partial_out;
    },
    reduce_acc_func
  );
  output1.fill_(result.first);
  output2.fill_(result.second);
}
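
// Vectorized two-output full reduction: each chunk is reduced with
// vec::reduce2_all using the two vectorized ops, and the per-chunk
// (first, second) pairs are combined with the scalar reduce_acc_func.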
template <typename scalar_t, typename func_t, typename vec_func_t1, typename vec_func_t2>
inline void reduce_all_impl_vec_two_outputs(
    Tensor& output1,
    Tensor& output2,
    const Tensor& input,
    const std::pair<scalar_t, scalar_t>& ident_v,
    func_t reduce_acc_func,
    vec_func_t1 reduce_chunk_func1,
    vec_func_t2 reduce_chunk_func2) {
  using Vec = Vectorized<opmath_type<scalar_t>>;
  using scalar_t_pair = std::pair<scalar_t, scalar_t>;
  const int64_t input_numel = input.numel();
  auto input_data = input.const_data_ptr<scalar_t>();
  // NOTE: parallel_reduce does not support the bool type
  std::pair<scalar_t, scalar_t> result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v,
    [&](int64_t start, int64_t end, const scalar_t_pair& /* ident */) -> scalar_t_pair {
      scalar_t_pair partial_out = vec::reduce2_all<scalar_t>(
        [=](Vec x, Vec y) { return reduce_chunk_func1(x, y); },
        [=](Vec x, Vec y) { return reduce_chunk_func2(x, y); },
        input_data + start,
        end - start);
      return partial_out;
    },
    reduce_acc_func
  );
  output1.fill_(result.first);
  output2.fill_(result.second);
}
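
// Simultaneous min and max over all elements in one pass over the data, with
// the same bool / int64_t / vectorized dispatch structure as the single-output
// kernels above.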
void aminmax_allreduce_kernel(
    const Tensor& input,
    Tensor& min_result,
    Tensor& max_result) {
  if (input.scalar_type() == ScalarType::Bool) {
    TensorIterator iter = TensorIteratorConfig()
      .add_input(input)
      .build();
    bool min_result_data = true;
    bool max_result_data = false;
    cpu_serial_kernel(iter, [&](const bool a) -> void {
      min_result_data = min_result_data && a;
      max_result_data = max_result_data || a;
    });
    min_result.fill_(min_result_data);
    max_result.fill_(max_result_data);
  } else if (input.scalar_type() == ScalarType::Long) {
    // for int64_t, the vectorized implementation has performance issues,
    // so just use the scalar path
    using int64_t_pair = std::pair<int64_t, int64_t>;
    reduce_all_impl_two_outputs<int64_t>(min_result, max_result, input,
      int64_t_pair(upper_bound<int64_t>(), lower_bound<int64_t>()),
      // reduce over chunk
      [=](int64_t_pair a, int64_t b) -> int64_t_pair {
        return int64_t_pair(min_impl(a.first, b), max_impl(a.second, b));
      },
      // combine two inputs
      [=](int64_t_pair a, int64_t_pair b) -> int64_t_pair {
        return int64_t_pair(min_impl(a.first, b.first), max_impl(a.second, b.second));
      }
    );
  } else {
    AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
      using Vec = Vectorized<opmath_type<scalar_t>>;
      using scalar_t_pair = std::pair<scalar_t, scalar_t>;
      reduce_all_impl_vec_two_outputs<scalar_t>(
        min_result,
        max_result,
        input,
        scalar_t_pair(upper_bound<scalar_t>(), lower_bound<scalar_t>()),
        [=](scalar_t_pair a, scalar_t_pair b) -> scalar_t_pair {
          return scalar_t_pair(
            min_impl(a.first, b.first), max_impl(a.second, b.second));
        },
        [=](Vec a, Vec b) -> Vec { return minimum(a, b); },
        [=](Vec a, Vec b) -> Vec { return maximum(a, b); }
      );
    });
  }
}

} // namespace
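
// Illustrative usage (not part of this file): these stubs back the full,
// no-`dim` CPU reductions, roughly
//
//   at::Tensor t = at::rand({1024});
//   at::Tensor mn = t.min();          // -> min_all_stub -> min_all_kernel_impl
//   at::Tensor mx = t.max();          // -> max_all_stub -> max_all_kernel_impl
//   auto [amin, amax] = t.aminmax();  // -> aminmax_allreduce_stub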
REGISTER_DISPATCH(min_all_stub, &min_all_kernel_impl)
REGISTER_DISPATCH(max_all_stub, &max_all_kernel_impl)
REGISTER_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_kernel)

} // namespace at::native