mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "Add SVE128 ISA (#158932)"
This reverts commit 92284fb2ff44f09a9c7df0d8cf6cac9903e376a4. Reverted https://github.com/pytorch/pytorch/pull/158932 on behalf of https://github.com/malfet due to Hmm, but from OSS point of view, this is a no-op ([comment](https://github.com/pytorch/pytorch/pull/158932#issuecomment-3387961238))
This commit is contained in:
@ -103,9 +103,7 @@ std::string get_cpu_capability() {
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
case native::CPUCapability::ZVECTOR:
|
||||
return "Z VECTOR";
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE128:
|
||||
return "SVE128";
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE256:
|
||||
return "SVE256";
|
||||
#else
|
||||
|
@ -102,31 +102,8 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
|
||||
// !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#else
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -163,8 +140,35 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
|
||||
return vaddvq_f32(acc_vec);
|
||||
}
|
||||
};
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
template <typename scalar_t, typename Op>
|
||||
inline scalar_t vec_reduce_all(
|
||||
|
@ -1,21 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <cstdint>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
// On aarch64, when either Sleef build flag (VEC256 or VECSVE) is set, include
// Sleef and make USE_SLEEF(a, b) expand to its first argument. In every other
// configuration USE_SLEEF(a, b) expands to its second argument.
#if defined(__aarch64__) &&                     \
    (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
     defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#define USE_SLEEF(with_sleef, without_sleef) with_sleef
#else
#define USE_SLEEF(with_sleef, without_sleef) without_sleef
#endif
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/bit_cast.h>
|
||||
|
@ -8,48 +8,13 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#elif defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_convert.h>
|
||||
|
||||
#else // NEON
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE128)
|
||||
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#endif
|
||||
|
||||
namespace at::vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
@ -83,6 +48,12 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
|
||||
DEFINE_SVE_CAST(int16_t, s16, float, f32)
|
||||
DEFINE_SVE_CAST(float, f32, double, f64)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <int64_t scale = 1>
|
||||
@ -202,11 +173,9 @@ std::pair<
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
|
||||
Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
@ -255,27 +224,12 @@ std::pair<
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
|
||||
Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#define DEFINE_FLIP_FUNC(type, sve_func) \
|
||||
inline Vectorized<type> flip(const Vectorized<type>& v) { \
|
||||
return Vectorized<type>(sve_func(v)); \
|
||||
}
|
||||
// Use the macro to define the flip functions
|
||||
DEFINE_FLIP_FUNC(float, svrev_f32)
|
||||
DEFINE_FLIP_FUNC(double, svrev_f64)
|
||||
DEFINE_FLIP_FUNC(int64_t, svrev_s64)
|
||||
DEFINE_FLIP_FUNC(int32_t, svrev_s32)
|
||||
DEFINE_FLIP_FUNC(int16_t, svrev_s16)
|
||||
DEFINE_FLIP_FUNC(int8_t, svrev_s8)
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
@ -1,8 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <ATen/cpu/vec/vec_common_aarch64.h>
|
||||
#elif defined(CPU_CAPABILITY_AVX512)
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
#include <ATen/cpu/vec/vec512/vec512.h>
|
||||
#else
|
||||
#include <ATen/cpu/vec/vec128/vec128.h>
|
||||
@ -13,34 +11,6 @@ namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
// Streams the raw underlying value of a qint32 and hands the stream back so
// insertions can be chained.
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
  return stream << val.val_;
}
|
||||
// Streams a qint8. The underlying value is widened to int before insertion
// (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
  const int widened = static_cast<int>(val.val_);
  return stream << widened;
}
|
||||
// Streams a quint8. The underlying value is widened to unsigned int before
// insertion (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
  const unsigned int widened = static_cast<unsigned int>(val.val_);
  return stream << widened;
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
|
||||
__at_align__ bool buffer[x.size()];
|
||||
x.ne(Vectorized<int8_t>(0)).store(buffer);
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -263,13 +262,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
c10::bit_cast<at_bfloat16_t>(val6.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE128)
  // SVE128 interop: construct from an SVE bf16 register by taking its NEON
  // quad-word view.
  Vectorized(svbfloat16_t sve_reg) : Vectorized16(svget_neonq(sve_reg)) {}

  // SVE128 interop: hand the stored NEON value back as an SVE bf16 register.
  operator svbfloat16_t() const {
    return svset_neonq(svundef_bf16(), values);
  }
#endif
|
||||
|
||||
static Vectorized<c10::BFloat16> blendv(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
@ -382,23 +374,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
Vectorized ge(const Vectorized& other) const;
|
||||
Vectorized lt(const Vectorized& other) const;
|
||||
Vectorized le(const Vectorized& other) const;
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
template <typename step_t>
|
||||
static Vectorized<BFloat16> arange(
|
||||
BFloat16 base = 0.f,
|
||||
step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ BFloat16 buffer[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svget_neonq(
|
||||
svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
|
||||
}
|
||||
|
||||
#endif // CPU_CAPABILITY_SVE128
|
||||
|
||||
}; // Vectorized<c10::BFloat16>
|
||||
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
|
||||
@ -422,24 +397,6 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
|
||||
return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(
|
||||
const BFloat16* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
|
||||
auto floats = convert_bfloat16_float(bf16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
Vectorized<c10::BFloat16> binary_operator_via_float(
|
||||
Op op,
|
||||
@ -622,12 +579,6 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
return -a * b - c;
|
||||
}
|
||||
|
||||
#else //
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
@ -60,7 +60,6 @@ struct VecConvert<float, 1, BFloat16, 1> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
|
||||
// defined(CPU_CAPABILITY_SVE128))
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -4,10 +4,13 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
|
||||
// Sleef offers vectorized versions of some transcendentals
|
||||
// such as sin, cos, tan etc..
|
||||
// However for now opting for STL, since we are not building
|
||||
@ -32,6 +35,12 @@ inline namespace CPU_CAPABILITY {
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
|
||||
// USE_SLEEF(a, b) selects between two code fragments at preprocessing time:
// it expands to `a` when the VEC256 Sleef build flag is defined and to `b`
// otherwise.
#ifdef AT_BUILD_ARM_VEC256_WITH_SLEEF
#define USE_SLEEF(with_sleef, without_sleef) with_sleef
#else
#define USE_SLEEF(with_sleef, without_sleef) without_sleef
#endif
|
||||
|
||||
template <int index, bool mask_val>
|
||||
struct BlendRegs {
|
||||
static float32x4_t impl(
|
||||
@ -85,12 +94,6 @@ class Vectorized<float> {
|
||||
operator float32x4_t() const {
|
||||
return values;
|
||||
}
|
||||
#if defined(CPU_CAPABILITY_SVE128)
  // SVE128 interop: construct from an SVE float register by taking its NEON
  // quad-word view.
  Vectorized(svfloat32_t sve_reg) : values(svget_neonq(sve_reg)) {}

  // SVE128 interop: hand the stored NEON value back as an SVE float register.
  operator svfloat32_t() const {
    return svset_neonq(svundef_f32(), values);
  }
#endif
|
||||
template <int64_t mask>
|
||||
static Vectorized<float> blend(
|
||||
const Vectorized<float>& a,
|
||||
|
@ -4,6 +4,7 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -24,6 +25,7 @@ inline namespace CPU_CAPABILITY {
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=45824
|
||||
// Most likely we will do aarch32 support with inline asm.
|
||||
#if !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __BIG_ENDIAN__
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
@ -419,24 +421,6 @@ Vectorized<c10::Half> inline operator+(
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reads Vectorized<float>::size() Half values starting at `data`, converts
// each one to float element by element, and loads the result into `out`.
inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
  using FloatVec = Vectorized<float>;
  __at_align__ float widened[FloatVec::size()];
  for (const auto lane : c10::irange(FloatVec::size())) {
    widened[lane] = data[lane];
  }
  out = FloatVec::loadu(widened);
}
|
||||
|
||||
inline void load_fp32_from_fp16(
|
||||
const c10::Half* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
|
||||
auto floats = convert_half_float(f16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline operator-(
|
||||
const Vectorized<c10::Half>& a,
|
||||
@ -672,53 +656,6 @@ Vectorized<c10::Half> inline fnmsub(
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Generates a pair of conversion helpers for a reduced-precision `type`,
// both going through aligned scratch arrays and the `convert` routine:
//   convert_<name>_float(v)    -> tuple of two float vectors
//   convert_float_<name>(a, b) -> one `type` vector built from two float ones
#define CONVERT_NON_VECTORIZED_INIT(type, name)                           \
  inline std::tuple<Vectorized<float>, Vectorized<float>>                 \
      convert_##name##_float(const Vectorized<type>& src) {               \
    constexpr int64_t kLanes = Vectorized<type>::size();                  \
    __at_align__ float as_float[kLanes];                                  \
    __at_align__ type as_reduced[kLanes];                                 \
    src.store(as_reduced);                                                \
    convert(as_reduced, as_float, kLanes);                                \
    return std::make_tuple(                                               \
        Vectorized<float>::loadu(as_float),                               \
        Vectorized<float>::loadu(as_float + Vectorized<float>::size()));  \
  }                                                                       \
  inline Vectorized<type> convert_float_##name(                           \
      const Vectorized<float>& first, const Vectorized<float>& second) {  \
    constexpr int64_t kLanes = Vectorized<type>::size();                  \
    __at_align__ float as_float[kLanes];                                  \
    __at_align__ type as_reduced[kLanes];                                 \
    first.store(as_float);                                                \
    second.store(as_float + Vectorized<float>::size());                   \
    convert(as_float, as_reduced, kLanes);                                \
    return Vectorized<type>::loadu(as_reduced);                           \
  }
|
||||
|
||||
// Generates the two load_fp32_from_<name> overloads for `type`:
//   (data, out)        converts Vectorized<float>::size() elements one by one
//                      and loads them into `out`;
//   (data, out1, out2) does the same for two consecutive runs of that length.
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name)                           \
  inline void load_fp32_from_##name(                                        \
      const type* data, Vectorized<float>& out) {                           \
    __at_align__ float widened[Vectorized<float>::size()];                  \
    for (const auto lane : c10::irange(Vectorized<float>::size())) {        \
      widened[lane] = data[lane];                                           \
    }                                                                       \
    out = Vectorized<float>::loadu(widened);                                \
  }                                                                         \
                                                                            \
  inline void load_fp32_from_##name(                                        \
      const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
    load_fp32_from_##name(data, out1);                                      \
    load_fp32_from_##name(data + Vectorized<float>::size(), out2);          \
  }
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -9,16 +9,21 @@
|
||||
#if !( \
|
||||
defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
|
||||
defined(CPU_CAPABILITY_ZVECTOR))
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#else
|
||||
// clang-format off
|
||||
#include <ATen/cpu/vec/vec256/vec256_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_int.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
#endif
|
||||
#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
|
||||
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
|
||||
#endif
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
// clang-format on
|
||||
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
|
||||
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
|
||||
@ -51,6 +56,34 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
// Streams the raw underlying value of a qint32 and hands the stream back so
// insertions can be chained.
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
  return stream << val.val_;
}
|
||||
// Streams a qint8. The underlying value is widened to int before insertion
// (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
  const int widened = static_cast<int>(val.val_);
  return stream << widened;
}
|
||||
// Streams a quint8. The underlying value is widened to unsigned int before
// insertion (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
  const unsigned int widened = static_cast<unsigned int>(val.val_);
  return stream << widened;
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !(defined(__aarch64__))
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
#endif
|
||||
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
#endif
|
||||
|
||||
|
@ -5,13 +5,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <ATen/native/quantized/AffineQuantizerBase.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
@ -922,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#else
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
@ -1379,18 +1372,12 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
|
||||
@ -1415,14 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
|
||||
Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
|
||||
@ -1440,8 +1420,5 @@ Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -31,6 +31,34 @@ namespace vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
// Streams the raw underlying value of a qint32 and hands the stream back so
// insertions can be chained.
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
  return stream << val.val_;
}
|
||||
// Streams a qint8. The underlying value is widened to int before insertion
// (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
  const int widened = static_cast<int>(val.val_);
  return stream << widened;
}
|
||||
// Streams a quint8. The underlying value is widened to unsigned int before
// insertion (presumably so it prints as a number rather than as a character).
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
  const unsigned int widened = static_cast<unsigned int>(val.val_);
  return stream << widened;
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
|
||||
|
@ -67,7 +67,18 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 64
|
||||
#define int_vector __m512i
|
||||
#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
|
||||
#elif defined(__aarch64__) && \
|
||||
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
|
||||
// SVE code expects 256-vectors; leave that set for SVE?
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else // CPU_CAPABILITY_AVX512
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
@ -77,27 +88,7 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#define int_vector __m256i
|
||||
#elif defined(__aarch64__)
|
||||
// Define alignment and vector width for SVE128/Default (e.g., NEON)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else
|
||||
// Fallback: define default alignment and vector width
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(32))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#endif
|
||||
#endif // CPU_CAPABILITY_AVX512
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
|
@ -1157,103 +1157,103 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
|
||||
REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
|
||||
REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
} // namespace at::native
|
||||
|
@ -39,21 +39,19 @@ static CPUCapability compute_cpu_capability() {
|
||||
}
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
if (envar == "sve") {
|
||||
// Select SVE capability based on the maximum SVE VL supported by the HW.
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (envar == "sve256") {
|
||||
if (sve_vl == 256) {
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE256;
|
||||
}
|
||||
} else if (sve_vl == 128) {
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
} else {
|
||||
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
#endif
|
||||
}
|
||||
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef HAVE_AVX512_CPU_DEFINITION
|
||||
if (envar == "avx512") {
|
||||
@ -115,11 +113,6 @@ static CPUCapability compute_cpu_capability() {
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (sve_vl == 128) { // Check for SVE128
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
#endif
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
@ -154,9 +147,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
|
||||
c10::DeviceType::CPU,
|
||||
@ -194,9 +184,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, SVE128
|
||||
#endif
|
||||
);
|
||||
if (!std::holds_alternative<ErrorType>(result)) {
|
||||
@ -255,9 +242,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
|
||||
auto result = try_get_call_ptr(
|
||||
@ -282,10 +266,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
,
|
||||
SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
,
|
||||
SVE128
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)) {
|
||||
@ -320,9 +300,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
){
|
||||
|
||||
@ -365,16 +342,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
return DispatchResult(SVE256);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
} else {
|
||||
return DispatchResult(SVE128);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
}
|
||||
@ -396,9 +363,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
auto capability = static_cast<int>(get_cpu_capability());
|
||||
(void)capability;
|
||||
@ -444,17 +408,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
return SVE256;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
} else {
|
||||
return SVE128;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
|
@ -64,9 +64,8 @@ enum class CPUCapability {
|
||||
VSX = 1,
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
ZVECTOR = 1,
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
SVE256 = 1,
|
||||
SVE128 = 2,
|
||||
#else
|
||||
AVX2 = 1,
|
||||
AVX512 = 2,
|
||||
@ -118,9 +117,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -142,9 +138,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -166,9 +159,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -193,9 +183,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -253,9 +240,6 @@ private:
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE128)
|
||||
#endif
|
||||
)
|
||||
);
|
||||
@ -317,9 +301,6 @@ public:
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE128)
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)){
|
||||
@ -344,9 +325,6 @@ public:
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
static TORCH_API FnPtr SVE256;
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
static TORCH_API FnPtr SVE128;
|
||||
#endif
|
||||
private:
|
||||
DispatchStubImpl impl;
|
||||
};
|
||||
@ -454,12 +432,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#define REGISTER_SVE256_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
#define REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE128, fn)
|
||||
#else
|
||||
#define REGISTER_SVE128_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
// Macro to register the same kernel for all CPU arch types. This is useful
|
||||
// if a kernel does not benefit from being recompiled across different arch types.
|
||||
#define REGISTER_ALL_CPU_DISPATCH(name, fn) \
|
||||
@ -468,11 +440,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
REGISTER_AVX2_DISPATCH(name, fn) \
|
||||
REGISTER_VSX_DISPATCH(name, fn) \
|
||||
REGISTER_ZVECTOR_DISPATCH(name, fn) \
|
||||
REGISTER_SVE256_DISPATCH(name, fn) \
|
||||
REGISTER_SVE128_DISPATCH(name, fn)
|
||||
|
||||
#define REGISTER_SVE_DISPATCH(name, fn) \
|
||||
REGISTER_SVE128_DISPATCH(name, fn) \
|
||||
REGISTER_SVE256_DISPATCH(name, fn)
|
||||
|
||||
#define REGISTER_NO_CPU_DISPATCH(name) \
|
||||
@ -515,7 +482,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
|
||||
// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
|
||||
// ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others.
|
||||
// ALSO_REGISTER_SVE128_DISPATCH should be used for ensuring SVE128 dispatch, among others.
|
||||
#ifdef CPU_CAPABILITY_AVX512
|
||||
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
|
||||
#else
|
||||
@ -523,7 +489,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#endif
|
||||
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#define ALSO_REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#endif
|
||||
} // namespace at::native
|
||||
|
||||
|
@ -466,7 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
|
||||
// offsets dispatches
|
||||
REGISTER_ARCH_DISPATCH(
|
||||
@ -477,7 +477,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
|
||||
// Currently some computation is being duplicated across forward and backward.
|
||||
// TODO: Cache indices in forward pass to reuse in backward
|
||||
@ -548,7 +548,7 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
|
||||
@ -568,7 +568,7 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
|
||||
|
@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
|
||||
const vec::Vectorized<c10::Half>& b,
|
||||
const vec::Vectorized<float>& acc_low,
|
||||
const vec::Vectorized<float>& acc_high) {
|
||||
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
|
||||
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
|
||||
return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
|
||||
#else
|
||||
const auto [a_float_low, a_float_high] = convert_half_float(a);
|
||||
@ -233,7 +233,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
|
||||
|
||||
// Return a + b_low * c_low + b_high * c_high
|
||||
vec::Vectorized<float> fmadd(vec::Vectorized<float> a, vec::Vectorized<Half> b, vec::Vectorized<Half> c) {
|
||||
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)
|
||||
// NOTE: this instruction is an optional instruction in ARM v8.2 and
|
||||
// v8.3, but mandatory in v8.4 per
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-03/SIMD-FP-Instructions/FMLAL--FMLAL2--vector---Floating-point-fused-Multiply-Add-Long-to-accumulator--vector--?lang=en
|
||||
|
@ -165,7 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
|
||||
REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
|
||||
// _out variants can be shared between PocketFFT and MKL
|
||||
Tensor& _fft_r2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalization,
|
||||
|
@ -27,6 +27,6 @@ REGISTER_AVX512_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -161,19 +161,19 @@ REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_
|
||||
REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(sparse_mask_projection_out_stub, DEFAULT, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_AVX512_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
}
|
||||
|
@ -448,7 +448,7 @@ REGISTER_AVX2_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_SVE_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
|
||||
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)
|
||||
|
||||
int64_t _fused_sdp_choice_meta(
|
||||
|
@ -15,7 +15,6 @@ from ..cpu_vec_isa import (
|
||||
VecAVX512,
|
||||
VecISA,
|
||||
VecNEON,
|
||||
VecSVE128,
|
||||
VecSVE256,
|
||||
)
|
||||
from ..utils import IndentedBuffer, parallel_num_threads
|
||||
@ -425,14 +424,6 @@ def do_not_use_with_small_m_for_int8_woq(config, m, n, k, alpha, num_threads, **
|
||||
output_dtype=torch.float,
|
||||
compute_dtype=torch.float,
|
||||
),
|
||||
*generate_gemm_config(
|
||||
VecSVE128,
|
||||
[(4, 24, 1), (4, 16, 1), (8, 8, 1)],
|
||||
input_dtype=torch.float,
|
||||
input2_dtype=torch.float,
|
||||
output_dtype=torch.float,
|
||||
compute_dtype=torch.float,
|
||||
),
|
||||
*generate_gemm_config(
|
||||
VecSVE256,
|
||||
[(4, 24, 1), (4, 16, 1), (8, 8, 1)],
|
||||
|
@ -156,7 +156,7 @@ cdll.LoadLibrary("__lib_path__")
|
||||
@dataclasses.dataclass
|
||||
class VecNEON(VecISA):
|
||||
_bit_width = 128 # This is required to leverage the compute implemented in aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
|
||||
_macro = ["CPU_CAPABILITY_NEON", "AT_BUILD_ARM_VECSVE_WITH_SLEEF"]
|
||||
_macro = ["CPU_CAPABILITY_NEON", "AT_BUILD_ARM_VEC256_WITH_SLEEF"]
|
||||
_arch_flags = "" # Unused
|
||||
_dtype_nelements = {torch.float: 4, torch.bfloat16: 8, torch.float16: 8}
|
||||
|
||||
@ -168,29 +168,9 @@ class VecNEON(VecISA):
|
||||
__hash__: Callable[[VecISA], Any] = VecISA.__hash__ # type: ignore[assignment]
|
||||
|
||||
|
||||
class VecSVE128(VecISA):
|
||||
# SVE with 128-bit width
|
||||
_bit_width = 128
|
||||
_macro = [
|
||||
"CPU_CAPABILITY_SVE",
|
||||
"CPU_CAPABILITY_SVE128",
|
||||
"AT_BUILD_ARM_VECSVE_WITH_SLEEF",
|
||||
"__ARM_FEATURE_BF16",
|
||||
]
|
||||
_arch_flags = "-march=armv9-a+sve2+fp16+fp16fml+bf16 -msve-vector-bits=128"
|
||||
_dtype_nelements = {torch.float: 4, torch.bfloat16: 8, torch.float16: 8}
|
||||
|
||||
def __str__(self) -> str:
|
||||
if config.is_fbcode():
|
||||
return "sve128"
|
||||
return "asimd"
|
||||
|
||||
__hash__: Callable[[VecISA], Any] = VecISA.__hash__ # type: ignore[assignment]
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class VecSVE256(VecISA):
|
||||
# SVE with 256-bit width
|
||||
# this function can be repurposed for SVE with variable vec length
|
||||
_bit_width = 256
|
||||
_macro = [
|
||||
"CPU_CAPABILITY_SVE",
|
||||
@ -430,7 +410,6 @@ supported_vec_isa_list = [
|
||||
VecAVX512(),
|
||||
VecAVX2(),
|
||||
VecNEON(),
|
||||
VecSVE128(),
|
||||
VecSVE256(),
|
||||
]
|
||||
|
||||
@ -496,8 +475,6 @@ def valid_vec_isa_list() -> list[VecISA]:
|
||||
elif arch == "aarch64":
|
||||
if torch.backends.cpu.get_cpu_capability() == "SVE256":
|
||||
isa_list.append(VecSVE256())
|
||||
elif torch.backends.cpu.get_cpu_capability() == "SVE128":
|
||||
isa_list.append(VecSVE128())
|
||||
else:
|
||||
isa_list.append(VecNEON())
|
||||
|
||||
|
@ -16,7 +16,6 @@ def get_cpu_capability() -> str:
|
||||
- "NO AVX"
|
||||
- "AVX2"
|
||||
- "AVX512"
|
||||
- "SVE128"
|
||||
- "SVE256"
|
||||
"""
|
||||
return torch._C._get_cpu_capability()
|
||||
|
@ -34,8 +34,7 @@
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || \
|
||||
defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || \
|
||||
defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE128) || \
|
||||
defined(CPU_CAPABILITY_SVE256)
|
||||
defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE256)
|
||||
#define INDUCTOR_USE_VECTOR_TYPES() 1
|
||||
#else
|
||||
#define INDUCTOR_USE_VECTOR_TYPES() 0
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/ScalarOps.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/native/EmbeddingBag.h>
|
||||
#include <ATen/native/Fill.h>
|
||||
|
Reference in New Issue
Block a user