Compare commits


4 Commits

SHA1        Message                                   Date
d7a8794105  Update [ghstack-poisoned]                 2025-10-28 16:37:02 +00:00
842a714d33  Update (base update) [ghstack-poisoned]   2025-10-28 16:37:02 +00:00
62151f29e0  Update [ghstack-poisoned]                 2025-10-20 11:15:17 +00:00
4f67ce5eba  Update (base update) [ghstack-poisoned]   2025-10-20 11:15:17 +00:00
7 changed files with 259 additions and 59 deletions

View File

@@ -116,10 +116,10 @@ class Vectorized<int64_t> : public Vectorizedi {
   __at_align__ int64_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
   return loadu(tmp_values);
@@ -266,10 +266,10 @@ class Vectorized<int32_t> : public Vectorizedi {
   __at_align__ int32_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
   return loadu(tmp_values);
@@ -566,10 +566,10 @@ class Vectorized<int16_t> : public Vectorizedi {
   __at_align__ int16_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
   return loadu(tmp_values);
@@ -914,10 +914,10 @@ class Vectorized8 : public Vectorizedi {
   __at_align__ T tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(T));
   return loadu(tmp_values);
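Note: the four hunks above change the padding value for partial loads from 0 to 1, so lanes past `count` hold a value that stays benign under later arithmetic in a vectorized tail (e.g. division or reciprocal). A minimal standalone sketch of the padded-load idiom, using only the standard library (`loadu_padded` is a hypothetical name, not the PyTorch API):

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>

// Load `count` valid elements and pad the remaining lanes with 1, mirroring
// the fill-loop-plus-memcpy pattern in the hunks above.
template <size_t kLanes>
std::array<int64_t, kLanes> loadu_padded(const int64_t* ptr, size_t count) {
  std::array<int64_t, kLanes> tmp;
  tmp.fill(1); // pad value changed from 0 to 1 in this diff
  std::memcpy(tmp.data(), ptr, std::min(count, kLanes) * sizeof(int64_t));
  return tmp;
}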

View File

@@ -130,7 +130,8 @@ class Vectorized<int64_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask8 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi64(mask, ptr);
+      auto ones = _mm512_set1_epi64(1);
+      return _mm512_mask_loadu_epi64(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -332,7 +333,8 @@ class Vectorized<int32_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask16 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi32(mask, ptr);
+      auto ones = _mm512_set1_epi32(1);
+      return _mm512_mask_loadu_epi32(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -660,7 +662,8 @@ class Vectorized<int16_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask32 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi16(mask, ptr);
+      auto ones = _mm512_set1_epi16(1);
+      return _mm512_mask_loadu_epi16(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -1101,7 +1104,8 @@ class Vectorized8 : public Vectorizedi {
       return loadu_one_fourth(ptr);
     } else {
       __mmask64 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi8(mask, ptr);
+      auto ones = _mm512_set1_epi8(1);
+      return _mm512_mask_loadu_epi8(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
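Note: `_mm512_maskz_loadu_*` zeroes the masked-off lanes, while the `_mm512_mask_loadu_*` form adopted above takes them from a source vector instead — here all-ones, matching the new padding convention. A minimal sketch of the int64 flavor, assuming an AVX-512F target compiled with -mavx512f (`load_tail_ones` is a hypothetical wrapper name):

#include <immintrin.h>
#include <cstdint>

// Load the low `count` lanes (count < 8) from ptr; the remaining lanes are
// taken from the all-ones source vector rather than being zeroed.
__m512i load_tail_ones(const int64_t* ptr, int count) {
  __mmask8 mask = (1ULL << count) - 1;
  __m512i ones = _mm512_set1_epi64(1);
  return _mm512_mask_loadu_epi64(ones, mask, ptr);
}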

View File

@@ -165,6 +165,19 @@ class VecMask {
     return VectorizedN<T, N>(VectorizedN<T, N>::loadu(mask));
   }

+  template <typename U>
+  static VecMask<T, N> from(U* b, int count) {
+    using int_t = int_same_size_t<T>;
+    __at_align__ T mask[size()];
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+    for (int i = 0; i < count; i++) {
+      *(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
+    }
+    return VectorizedN<T, N>(VectorizedN<T, N>::loadu(mask, count));
+  }
+
   static VecMask<T, N> blendv(
       const VecMask<T, N>& c,
       const VecMask<T, N>& b,
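Note: the new count-limited overload expands the first `count` boolean inputs into per-lane all-ones/all-zero bit patterns, then defers to the count-aware `loadu` so unwritten lanes get the padded value. A scalar analogue of the bit-expansion step, assuming 32-bit lanes (`bools_to_lane_masks` is a hypothetical name):

#include <cstdint>
#include <cstring>

// Expand count bools into full-width lane masks: true -> 0xFFFFFFFF,
// false -> 0x0, written through the float lanes via memcpy as in the diff.
void bools_to_lane_masks(const bool* b, int count, float* mask_out) {
  for (int i = 0; i < count; i++) {
    uint32_t lane = b[i] ? ~uint32_t{0} : uint32_t{0};
    std::memcpy(mask_out + i, &lane, sizeof(lane));
  }
}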

View File

@@ -187,12 +187,13 @@ class VectorizedN {
   static VectorizedN<T, N> loadu(const void* ptr, int64_t count) {
     VectorizedN<T, N> result;
     for (int i = 0; i < N; ++i) {
-      result.values[i] = Vectorized<T>::loadu(
-          ptr, std::min(count, (int64_t)Vectorized<T>::size()));
-      ptr = static_cast<const T*>(ptr) + Vectorized<T>::size();
-      count -= Vectorized<T>::size();
-      if (count <= 0) {
-        break;
+      if (count > 0) {
+        result.values[i] = Vectorized<T>::loadu(
+            ptr, std::min(count, (int64_t)Vectorized<T>::size()));
+        ptr = static_cast<const T*>(ptr) + Vectorized<T>::size();
+        count -= Vectorized<T>::size();
+      } else {
+        result.values[i] = Vectorized<T>((T)1);
       }
     }
     return result;
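Note: instead of breaking out early and leaving trailing sub-vectors uninitialized, the loop now fills every exhausted sub-vector with the scalar 1. A standalone scalar model of that control flow, where `chunks` stands in for N and `lanes` for Vectorized<T>::size() (`loadu_n_model` is a hypothetical name):

#include <algorithm>
#include <cstdint>
#include <vector>

// Each chunk either loads up to `lanes` valid elements or, once `count` is
// exhausted, stays at the fill value 1.
std::vector<float> loadu_n_model(const float* ptr, int64_t count,
                                 int chunks, int lanes) {
  std::vector<float> out(static_cast<size_t>(chunks) * lanes, 1.0f);
  for (int i = 0; i < chunks; ++i) {
    if (count <= 0) break; // remaining chunks keep the fill value
    int64_t n = std::min<int64_t>(count, lanes);
    std::copy(ptr, ptr + n, out.begin() + static_cast<int64_t>(i) * lanes);
    ptr += lanes;
    count -= lanes;
  }
  return out;
}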

View File

@@ -1543,22 +1543,26 @@ class CPUReproTests(TestCase):
         with config.patch({"cpp.simdlen": None}):
             torch._dynamo.reset()
             metrics.reset()
-            self.common(
-                fn,
-                (
-                    x,
-                    scale,
-                    zero_point,
-                    use_dequant,
-                    use_quant,
-                    quant_min,
-                    quant_max,
-                    dtype,
-                    dequant_out_dtype,
-                ),
-            )
+            inputs = (
+                x,
+                scale,
+                zero_point,
+                use_dequant,
+                use_quant,
+                quant_min,
+                quant_max,
+                dtype,
+                dequant_out_dtype,
+            )
+            self.common(fn, inputs)
             check_metrics_vec_kernel_count(1)

+            # Check that both main and tail loops are vectorized
+            if dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+                compiled_fn = torch.compile(fn)
+                _, code = run_and_get_cpp_code(compiled_fn, *inputs)
+                FileCheck().check_count("loadu", 2, exactly=True).run(code)
+
     @requires_vectorization
     def test_dequant_quant_lowering_uint8(self):
         self._test_dequant_quant_lowering_helper(torch.uint8)
@@ -4657,6 +4661,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.rand(37)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VecMask<float,1>::from", 2, exactly=True
+                ).run(code)
+
     @torch._dynamo.config.patch(dynamic_shapes=True)
     @torch._dynamo.config.patch(assume_static_by_default=False)
     def test_symbolic_shape_scalar_value_reduction(self):
@@ -4678,6 +4699,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int32)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::Vectorized<int32_t>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_int32_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4687,6 +4725,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int32)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::Vectorized<int32_t>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_uint32_pointwise_vec(self):
         def fn(x):
             return x * x
@@ -4716,6 +4771,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int64)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<int64_t,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_int64_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4725,6 +4797,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int64)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<int64_t,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_uint64_pointwise_vec(self):
         def fn(x):
             return x * x
@@ -4810,6 +4899,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<double,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_double_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4819,6 +4925,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<double,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_convert_fp32_to_double_vec(self):
         def fn(x):
             return x.to(torch.double)
@@ -4828,6 +4951,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn(37, 37)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::convert<double,2,float,1>", 2, exactly=True
+                ).run(code)
+
     def test_convert_double_to_fp32_vec(self):
         def fn(x):
             return x.to(torch.float32)
@@ -4837,6 +4977,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::convert<float,1,double,2>", 2, exactly=True
+                ).run(code)
+
     def test_no_redundant_to_dtypes_between_fused_scheduler_node(self):
         # https://github.com/pytorch/pytorch/issues/115260
         p0 = torch.tensor([1.0879], dtype=torch.float16)
@@ -5267,7 +5424,7 @@ class CPUReproTests(TestCase):
         _, code = run_and_get_cpp_code(opt_fn, x)
         FileCheck().check_count(
             "return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(),",
-            4,
+            8,
             exactly=True,
         ).run(code)
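Note: each new test block picks a 37-element (or 37x37) input so the vector width does not divide the loop count, then asserts via FileCheck that the vector load/convert pattern appears exactly twice in the generated C++: once in the main loop and once in the tail. A scalar model of the loop structure being asserted, assuming 16 float lanes (`sum_model` is a hypothetical name, not the generated code):

#include <cstdint>

float sum_model(const float* in, int64_t n) {
  constexpr int64_t kLanes = 16;
  float acc = 0.f;
  int64_t i = 0;
  // Main loop: full-width vector loads in the generated kernel.
  for (; i + kLanes <= n; i += kLanes)
    for (int64_t j = 0; j < kLanes; j++) acc += in[i + j];
  // Tail: a single count-limited (padded) vector load in the generated
  // kernel, rather than a scalar fallback loop.
  for (; i < n; i++)
    acc += in[i];
  return acc;
}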

View File

@@ -158,14 +158,6 @@ VECTORIZABLE_DTYPES: list[torch.dtype] = [
     torch.float8_e5m2,
 ]

-MASKED_VECTORIZABLE_DTYPES: list[torch.dtype] = [
-    torch.float,
-    torch.bfloat16,
-    torch.float16,
-    torch.uint8,
-    torch.int8,
-]
-

 def reduction_init(reduction_type, dtype):
     if dtype in DTYPE_LOWP_FP:
@@ -1876,8 +1868,7 @@ class CppVecOverrides(CppOverrides):
             with code.indent():
                 code.writeline(f"tmpbuf_out[i] = {res};")
             if output_mask:
-                assert not kernel.tail_size
-                load_args = "tmpbuf_out.data()"
+                load_args = f"tmpbuf_out.data(), {cexpr_index(size)}"
                 load_fn = f"at::vec::VecMask<{cdtype},{n_vec}>::from"
             else:
                 load_args = f"tmpbuf_out.data(), {cexpr_index(size)}"
@@ -2739,7 +2730,7 @@ class CppVecKernel(CppKernel):
         loadbuf = f"{var} + {cexpr_index(index)}" if index != 0 else var
         if dtype == torch.bool:
             # TODO: should we consider load mask here?
-            line = f"{self._get_mask_type()}::from({loadbuf})"
+            line = f"{self._get_mask_type()}::from({loadbuf}, {cexpr_index(self.num_elems)})"
         else:
             line = (
                 f"{load_mask_str}.template loadu<{cpp_type},{num_vectors}>({loadbuf})"
@@ -2982,7 +2973,10 @@ class CppVecKernel(CppKernel):
             cdtype = DTYPE_TO_CPP[dtype]
             index = ops.index_expr(index, torch.int64).value
             assert isinstance(index, CppCSEVariable) and index.is_vec
-            line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value});"
+            if self.tail_size:
+                line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value}, {cexpr_index(self.tail_size)});"
+            else:
+                line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value});"
             self.stores.writeline(DeferredLine(name, line))
         else:
             raise NotImplementedError(f"store mode={mode}")
@@ -3450,7 +3444,10 @@ class CppVecKernel(CppKernel):
             if isinstance(next_value, CppCSEVariable):
                 assert next_value.dtype == torch.bool
                 (next_value,) = unify_mask_base_type(V.kernel.compute, (next_value,))
-            return f"{var} | {next_value}"
+            if self.tail_size:
+                return f"any_masked_reduce({var}, {next_value}, {cexpr_index(self.tail_size)})"
+            else:
+                return f"{var} | {next_value}"
         else:
             raise NotImplementedError
@@ -4357,13 +4354,6 @@ class CppKernelProxy(CppKernel):
             fn_list, var_sizes_list
         )
         assert len(tiling_factors) == len(tiling_indices)
-        # <TODO> This should be removed after full support for vectorization is implemented.
-        could_masked_vec = True
-        all_dtypes = _get_dtype_from_loopbodies(_get_loop_body(fn_list))
-        if any(dtype not in MASKED_VECTORIZABLE_DTYPES for dtype in all_dtypes):
-            # can be removed after masked vectorizable dtype are same with vectorizable dtype
-            could_masked_vec = False
-
         _inner_loop_reduction_outer_not = False
         _outer_loop = None
         if tiling_indices:
@@ -4390,7 +4380,7 @@ class CppKernelProxy(CppKernel):
                 )
                 tail_size = loop.size - loop.tiled_size
                 vec_kernel.active_ranges = {loop.var: (0, loop.tiled_size)}
-                if config.cpp.enable_loop_tail_vec and could_masked_vec:
+                if config.cpp.enable_loop_tail_vec:
                     tail_kernel = codegen_kernel(
                         self.vec_kernel_cls,
                         tiling_factors[0],
@@ -4437,7 +4427,7 @@ class CppKernelProxy(CppKernel):
                     inner_loop.var: inner_ranges["main"],
                 }
                 tail_kernel = []
-                if config.cpp.enable_loop_tail_vec and could_masked_vec:
+                if config.cpp.enable_loop_tail_vec:
                     for outer_r, inner_r in (
                         ("main", "tail"),
                         ("tail", "main"),

View File

@@ -296,23 +296,50 @@ inline T cascade_sum_combine(
 }

 template <typename T>
-T max_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T max_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = at::vec::maximum(a, b);
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> max_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a | b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
-T min_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T min_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = at::vec::minimum(a, b);
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> min_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a & b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
-T sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a + b;
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> sum_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a | b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
 T prod_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a * b;
@@ -324,6 +351,12 @@ T xor_sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a ^ b;
   return T::set(a, out, tail_size);
 }
+
+template <typename T1, typename T2>
+T1 any_masked_reduce(const T1& a, const T2& b, const int64_t tail_size) {
+  T1 out = a | b;
+  return T1::set(a, out, tail_size);
+}
 #endif

 // Refer to
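Note: every masked-reduce helper above leans on the `T::set(a, out, tail_size)` idiom: the combined value is taken for the first tail_size lanes and the old accumulator is kept for the rest, so the 1-padding from tail loads never leaks into the result. A scalar model of that idiom (`set_model` is a hypothetical name):

#include <array>
#include <cstdint>

template <size_t kLanes>
std::array<float, kLanes> set_model(const std::array<float, kLanes>& a,
                                    const std::array<float, kLanes>& out,
                                    int64_t tail_size) {
  std::array<float, kLanes> r = a; // padding lanes keep the accumulator
  for (int64_t i = 0; i < tail_size; i++) {
    r[i] = out[i]; // valid lanes take the combined value
  }
  return r;
}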
@@ -859,14 +892,16 @@ template <typename T, int NI, int NV>
 void atomic_add_vec(
     T* addr,
     at::vec::VectorizedN<int64_t, NI> index,
-    at::vec::VectorizedN<T, NV> offset) {
+    at::vec::VectorizedN<T, NV> offset,
+    std::optional<int64_t> tail_size = std::nullopt) {
   constexpr int len = at::vec::VectorizedN<int64_t, NI>::size();
   static_assert(len <= at::vec::VectorizedN<T, NV>::size());
   __at_align__ std::array<T, len> tmpbuf;
   __at_align__ std::array<int64_t, len> tmpidx;
   offset.store(tmpbuf.data(), len);
   index.store(tmpidx.data(), len);
-  for (int i = 0; i < len; i++) {
+  int size = tail_size.has_value() ? tail_size.value() : len;
+  for (int i = 0; i < size; i++) {
     atomic_add(addr + tmpidx[i], tmpbuf[i]);
   }
 }
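Note: with the optional tail_size argument, only the first tail_size gathered lanes are applied, which is what lets the tail loop reuse the vectorized scatter-add path. A standalone analogue, requiring C++20 for std::atomic_ref (`atomic_scatter_add` is a hypothetical name standing in for the atomic_add() helper used in the real header):

#include <atomic>
#include <cstdint>
#include <optional>

template <int kLen>
void atomic_scatter_add(float* addr, const int64_t (&idx)[kLen],
                        const float (&val)[kLen],
                        std::optional<int64_t> tail_size = std::nullopt) {
  int64_t n = tail_size.value_or(kLen); // apply only the valid lanes
  for (int64_t i = 0; i < n; i++) {
    std::atomic_ref<float>(addr[idx[i]]).fetch_add(val[i]);
  }
}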