Compare commits


4 Commits

SHA1        Message                                   Date
d7a8794105  Update [ghstack-poisoned]                 2025-10-28 16:37:02 +00:00
842a714d33  Update (base update) [ghstack-poisoned]   2025-10-28 16:37:02 +00:00
62151f29e0  Update [ghstack-poisoned]                 2025-10-20 11:15:17 +00:00
4f67ce5eba  Update (base update) [ghstack-poisoned]   2025-10-20 11:15:17 +00:00
7 changed files with 259 additions and 59 deletions

View File

@@ -116,10 +116,10 @@ class Vectorized<int64_t> : public Vectorizedi {
   __at_align__ int64_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
   return loadu(tmp_values);
@@ -266,10 +266,10 @@ class Vectorized<int32_t> : public Vectorizedi {
   __at_align__ int32_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
   return loadu(tmp_values);
@@ -566,10 +566,10 @@ class Vectorized<int16_t> : public Vectorizedi {
   __at_align__ int16_t tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
   return loadu(tmp_values);
@@ -914,10 +914,10 @@ class Vectorized8 : public Vectorizedi {
   __at_align__ T tmp_values[size()];
   // Ensure uninitialized memory does not change the output value See
   // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
-  // not initialize arrays to zero using "={0}" because gcc would compile it
+  // not initialize arrays to one using "={1}" because gcc would compile it
   // to two instructions while a loop would be compiled to one instruction.
   for (const auto i : c10::irange(size())) {
-    tmp_values[i] = 0;
+    tmp_values[i] = 1;
   }
   std::memcpy(tmp_values, ptr, count * sizeof(T));
   return loadu(tmp_values);
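Note: the four hunks above change the padding value for partial loads from 0 to 1, so lanes past `count` hold a value that stays benign under later arithmetic in a vectorized tail (e.g. division or reciprocal). A minimal standalone sketch of the padded-load idiom, using only the standard library (`loadu_padded` is a hypothetical name, not the PyTorch API):

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>

// Load `count` valid elements and pad the remaining lanes with 1, mirroring
// the fill-loop-plus-memcpy pattern in the hunks above.
template <size_t kLanes>
std::array<int64_t, kLanes> loadu_padded(const int64_t* ptr, size_t count) {
  std::array<int64_t, kLanes> tmp;
  tmp.fill(1); // pad value changed from 0 to 1 in this diff
  std::memcpy(tmp.data(), ptr, std::min(count, kLanes) * sizeof(int64_t));
  return tmp;
}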

View File

@@ -130,7 +130,8 @@ class Vectorized<int64_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask8 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi64(mask, ptr);
+      auto ones = _mm512_set1_epi64(1);
+      return _mm512_mask_loadu_epi64(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -332,7 +333,8 @@ class Vectorized<int32_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask16 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi32(mask, ptr);
+      auto ones = _mm512_set1_epi32(1);
+      return _mm512_mask_loadu_epi32(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -660,7 +662,8 @@ class Vectorized<int16_t> : public Vectorizedi {
       return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
     } else {
       __mmask32 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi16(mask, ptr);
+      auto ones = _mm512_set1_epi16(1);
+      return _mm512_mask_loadu_epi16(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
@@ -1101,7 +1104,8 @@ class Vectorized8 : public Vectorizedi {
       return loadu_one_fourth(ptr);
     } else {
       __mmask64 mask = (1ULL << count) - 1;
-      return _mm512_maskz_loadu_epi8(mask, ptr);
+      auto ones = _mm512_set1_epi8(1);
+      return _mm512_mask_loadu_epi8(ones, mask, ptr);
     }
   }
   void store(void* ptr, int count = size()) const {
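Note: `_mm512_maskz_loadu_*` zeroes the masked-off lanes, while the `_mm512_mask_loadu_*` form adopted above takes them from a source vector instead — here all-ones, matching the new padding convention. A minimal sketch of the int64 flavor, assuming an AVX-512F target compiled with -mavx512f (`load_tail_ones` is a hypothetical wrapper name):

#include <immintrin.h>
#include <cstdint>

// Load the low `count` lanes (count < 8) from ptr; the remaining lanes are
// taken from the all-ones source vector rather than being zeroed.
__m512i load_tail_ones(const int64_t* ptr, int count) {
  __mmask8 mask = (1ULL << count) - 1;
  __m512i ones = _mm512_set1_epi64(1);
  return _mm512_mask_loadu_epi64(ones, mask, ptr);
}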

View File

@@ -165,6 +165,19 @@ class VecMask {
     return VectorizedN<T, N>(VectorizedN<T, N>::loadu(mask));
   }

+  template <typename U>
+  static VecMask<T, N> from(U* b, int count) {
+    using int_t = int_same_size_t<T>;
+    __at_align__ T mask[size()];
+#ifndef __msvc_cl__
+#pragma unroll
+#endif
+    for (int i = 0; i < count; i++) {
+      *(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
+    }
+    return VectorizedN<T, N>(VectorizedN<T, N>::loadu(mask, count));
+  }
+
   static VecMask<T, N> blendv(
       const VecMask<T, N>& c,
       const VecMask<T, N>& b,
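Note: the new count-limited overload expands the first `count` boolean inputs into per-lane all-ones/all-zero bit patterns, then defers to the count-aware `loadu` so unwritten lanes get the padded value. A scalar analogue of the bit-expansion step, assuming 32-bit lanes (`bools_to_lane_masks` is a hypothetical name):

#include <cstdint>
#include <cstring>

// Expand count bools into full-width lane masks: true -> 0xFFFFFFFF,
// false -> 0x0, written through the float lanes via memcpy as in the diff.
void bools_to_lane_masks(const bool* b, int count, float* mask_out) {
  for (int i = 0; i < count; i++) {
    uint32_t lane = b[i] ? ~uint32_t{0} : uint32_t{0};
    std::memcpy(mask_out + i, &lane, sizeof(lane));
  }
}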

View File

@@ -187,12 +187,13 @@ class VectorizedN {
   static VectorizedN<T, N> loadu(const void* ptr, int64_t count) {
     VectorizedN<T, N> result;
     for (int i = 0; i < N; ++i) {
-      result.values[i] = Vectorized<T>::loadu(
-          ptr, std::min(count, (int64_t)Vectorized<T>::size()));
-      ptr = static_cast<const T*>(ptr) + Vectorized<T>::size();
-      count -= Vectorized<T>::size();
-      if (count <= 0) {
-        break;
+      if (count > 0) {
+        result.values[i] = Vectorized<T>::loadu(
+            ptr, std::min(count, (int64_t)Vectorized<T>::size()));
+        ptr = static_cast<const T*>(ptr) + Vectorized<T>::size();
+        count -= Vectorized<T>::size();
+      } else {
+        result.values[i] = Vectorized<T>((T)1);
       }
     }
     return result;
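Note: instead of breaking out early and leaving trailing sub-vectors uninitialized, the loop now fills every exhausted sub-vector with the scalar 1. A standalone scalar model of that control flow, where `chunks` stands in for N and `lanes` for Vectorized<T>::size() (`loadu_n_model` is a hypothetical name):

#include <algorithm>
#include <cstdint>
#include <vector>

// Each chunk either loads up to `lanes` valid elements or, once `count` is
// exhausted, stays at the fill value 1.
std::vector<float> loadu_n_model(const float* ptr, int64_t count,
                                 int chunks, int lanes) {
  std::vector<float> out(static_cast<size_t>(chunks) * lanes, 1.0f);
  for (int i = 0; i < chunks; ++i) {
    if (count <= 0) break; // remaining chunks keep the fill value
    int64_t n = std::min<int64_t>(count, lanes);
    std::copy(ptr, ptr + n, out.begin() + static_cast<int64_t>(i) * lanes);
    ptr += lanes;
    count -= lanes;
  }
  return out;
}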

View File

@@ -1543,22 +1543,26 @@ class CPUReproTests(TestCase):
         with config.patch({"cpp.simdlen": None}):
             torch._dynamo.reset()
             metrics.reset()
-            self.common(
-                fn,
-                (
-                    x,
-                    scale,
-                    zero_point,
-                    use_dequant,
-                    use_quant,
-                    quant_min,
-                    quant_max,
-                    dtype,
-                    dequant_out_dtype,
-                ),
-            )
+            inputs = (
+                x,
+                scale,
+                zero_point,
+                use_dequant,
+                use_quant,
+                quant_min,
+                quant_max,
+                dtype,
+                dequant_out_dtype,
+            )
+            self.common(fn, inputs)
             check_metrics_vec_kernel_count(1)

+            # Check that both main and tail loops are vectorized
+            if dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+                compiled_fn = torch.compile(fn)
+                _, code = run_and_get_cpp_code(compiled_fn, *inputs)
+                FileCheck().check_count("loadu", 2, exactly=True).run(code)
+
     @requires_vectorization
     def test_dequant_quant_lowering_uint8(self):
         self._test_dequant_quant_lowering_helper(torch.uint8)
@@ -4657,6 +4661,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.rand(37)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VecMask<float,1>::from", 2, exactly=True
+                ).run(code)
+
     @torch._dynamo.config.patch(dynamic_shapes=True)
     @torch._dynamo.config.patch(assume_static_by_default=False)
     def test_symbolic_shape_scalar_value_reduction(self):
@@ -4678,6 +4699,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int32)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::Vectorized<int32_t>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_int32_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4687,6 +4725,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int32)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::Vectorized<int32_t>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_uint32_pointwise_vec(self):
         def fn(x):
             return x * x
@@ -4716,6 +4771,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int64)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<int64_t,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_int64_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4725,6 +4797,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randint(0, 100, (37, 37), dtype=torch.int64)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<int64_t,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_uint64_pointwise_vec(self):
         def fn(x):
             return x * x
@@ -4810,6 +4899,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<double,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_double_reduction_vec(self):
         def fn(x):
             return x.sum(dim=1)
@@ -4819,6 +4925,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::VectorizedN<double,2>::loadu", 2, exactly=True
+                ).run(code)
+
     def test_convert_fp32_to_double_vec(self):
         def fn(x):
             return x.to(torch.double)
@@ -4828,6 +4951,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn(37, 37)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::convert<double,2,float,1>", 2, exactly=True
+                ).run(code)
+
     def test_convert_double_to_fp32_vec(self):
         def fn(x):
             return x.to(torch.float32)
@@ -4837,6 +4977,23 @@ class CPUReproTests(TestCase):
         self.common(fn, (x,))
         check_metrics_vec_kernel_count(1)

+        # Tail vectorization case
+        x = torch.randn((37, 37), dtype=torch.double)
+        torch._dynamo.reset()
+        metrics.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn)
+            actual, code = run_and_get_cpp_code(compiled_fn, x)
+            self.assertEqual(expected, actual)
+            # 1 generated vec kernel
+            check_metrics_vec_kernel_count(1)
+            # Check that both main and tail loops are vectorized
+            if _can_check_vec_metrics():
+                FileCheck().check_count(
+                    "at::vec::convert<float,1,double,2>", 2, exactly=True
+                ).run(code)
+
     def test_no_redundant_to_dtypes_between_fused_scheduler_node(self):
         # https://github.com/pytorch/pytorch/issues/115260
         p0 = torch.tensor([1.0879], dtype=torch.float16)
@@ -5267,7 +5424,7 @@ class CPUReproTests(TestCase):
         _, code = run_and_get_cpp_code(opt_fn, x)
         FileCheck().check_count(
             "return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(),",
-            4,
+            8,
             exactly=True,
         ).run(code)
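Note: each new test block picks a 37-element (or 37x37) input so the vector width does not divide the loop count, then asserts via FileCheck that the vector load/convert pattern appears exactly twice in the generated C++: once in the main loop and once in the tail. A scalar model of the loop structure being asserted, assuming 16 float lanes (`sum_model` is a hypothetical name, not the generated code):

#include <cstdint>

float sum_model(const float* in, int64_t n) {
  constexpr int64_t kLanes = 16;
  float acc = 0.f;
  int64_t i = 0;
  // Main loop: full-width vector loads in the generated kernel.
  for (; i + kLanes <= n; i += kLanes)
    for (int64_t j = 0; j < kLanes; j++) acc += in[i + j];
  // Tail: a single count-limited (padded) vector load in the generated
  // kernel, rather than a scalar fallback loop.
  for (; i < n; i++)
    acc += in[i];
  return acc;
}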

View File

@@ -158,14 +158,6 @@ VECTORIZABLE_DTYPES: list[torch.dtype] = [
     torch.float8_e5m2,
 ]

-MASKED_VECTORIZABLE_DTYPES: list[torch.dtype] = [
-    torch.float,
-    torch.bfloat16,
-    torch.float16,
-    torch.uint8,
-    torch.int8,
-]
-

 def reduction_init(reduction_type, dtype):
     if dtype in DTYPE_LOWP_FP:
@@ -1876,8 +1868,7 @@ class CppVecOverrides(CppOverrides):
             with code.indent():
                 code.writeline(f"tmpbuf_out[i] = {res};")
             if output_mask:
-                assert not kernel.tail_size
-                load_args = "tmpbuf_out.data()"
+                load_args = f"tmpbuf_out.data(), {cexpr_index(size)}"
                 load_fn = f"at::vec::VecMask<{cdtype},{n_vec}>::from"
             else:
                 load_args = f"tmpbuf_out.data(), {cexpr_index(size)}"
@@ -2739,7 +2730,7 @@ class CppVecKernel(CppKernel):
         loadbuf = f"{var} + {cexpr_index(index)}" if index != 0 else var
         if dtype == torch.bool:
             # TODO: should we consider load mask here?
-            line = f"{self._get_mask_type()}::from({loadbuf})"
+            line = f"{self._get_mask_type()}::from({loadbuf}, {cexpr_index(self.num_elems)})"
         else:
             line = (
                 f"{load_mask_str}.template loadu<{cpp_type},{num_vectors}>({loadbuf})"
@@ -2982,7 +2973,10 @@ class CppVecKernel(CppKernel):
             cdtype = DTYPE_TO_CPP[dtype]
             index = ops.index_expr(index, torch.int64).value
             assert isinstance(index, CppCSEVariable) and index.is_vec
-            line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value});"
+            if self.tail_size:
+                line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value}, {cexpr_index(self.tail_size)});"
+            else:
+                line = f"atomic_add_vec<{cdtype}, {n_idx}, {n_src}>({var}, {index}, {value});"
             self.stores.writeline(DeferredLine(name, line))
         else:
             raise NotImplementedError(f"store mode={mode}")
@@ -3450,7 +3444,10 @@ class CppVecKernel(CppKernel):
             if isinstance(next_value, CppCSEVariable):
                 assert next_value.dtype == torch.bool
                 (next_value,) = unify_mask_base_type(V.kernel.compute, (next_value,))
-            return f"{var} | {next_value}"
+            if self.tail_size:
+                return f"any_masked_reduce({var}, {next_value}, {cexpr_index(self.tail_size)})"
+            else:
+                return f"{var} | {next_value}"
         else:
             raise NotImplementedError
@@ -4357,13 +4354,6 @@ class CppKernelProxy(CppKernel):
             fn_list, var_sizes_list
         )
         assert len(tiling_factors) == len(tiling_indices)
-        # <TODO> This should be removed after full support for vectorization is implemented.
-        could_masked_vec = True
-        all_dtypes = _get_dtype_from_loopbodies(_get_loop_body(fn_list))
-        if any(dtype not in MASKED_VECTORIZABLE_DTYPES for dtype in all_dtypes):
-            # can be removed after masked vectorizable dtype are same with vectorizable dtype
-            could_masked_vec = False
-
         _inner_loop_reduction_outer_not = False
         _outer_loop = None
         if tiling_indices:
@@ -4390,7 +4380,7 @@ class CppKernelProxy(CppKernel):
                 )
                 tail_size = loop.size - loop.tiled_size
                 vec_kernel.active_ranges = {loop.var: (0, loop.tiled_size)}
-                if config.cpp.enable_loop_tail_vec and could_masked_vec:
+                if config.cpp.enable_loop_tail_vec:
                     tail_kernel = codegen_kernel(
                         self.vec_kernel_cls,
                         tiling_factors[0],
@@ -4437,7 +4427,7 @@ class CppKernelProxy(CppKernel):
                     inner_loop.var: inner_ranges["main"],
                 }
                 tail_kernel = []
-                if config.cpp.enable_loop_tail_vec and could_masked_vec:
+                if config.cpp.enable_loop_tail_vec:
                     for outer_r, inner_r in (
                         ("main", "tail"),
                         ("tail", "main"),

View File

@@ -296,23 +296,50 @@ inline T cascade_sum_combine(
 }

 template <typename T>
-T max_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T max_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = at::vec::maximum(a, b);
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> max_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a | b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
-T min_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T min_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = at::vec::minimum(a, b);
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> min_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a & b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
-T sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
+inline T sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a + b;
   return T::set(a, out, tail_size);
 }

+template <>
+inline at::vec::VecMask<float, 1> sum_masked_reduce(
+    const at::vec::VecMask<float, 1>& a,
+    const at::vec::VecMask<float, 1>& b,
+    const int64_t tail_size) {
+  auto out = a | b;
+  return at::vec::VecMask<float, 1>::set(a, out, tail_size);
+}
+
 template <typename T>
 T prod_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a * b;
@@ -324,6 +351,12 @@ T xor_sum_masked_reduce(const T& a, const T& b, const int64_t tail_size) {
   auto out = a ^ b;
   return T::set(a, out, tail_size);
 }
+
+template <typename T1, typename T2>
+T1 any_masked_reduce(const T1& a, const T2& b, const int64_t tail_size) {
+  T1 out = a | b;
+  return T1::set(a, out, tail_size);
+}
 #endif

 // Refer to
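Note: every masked-reduce helper above leans on the `T::set(a, out, tail_size)` idiom: the combined value is taken for the first tail_size lanes and the old accumulator is kept for the rest, so the 1-padding from tail loads never leaks into the result. A scalar model of that idiom (`set_model` is a hypothetical name):

#include <array>
#include <cstdint>

template <size_t kLanes>
std::array<float, kLanes> set_model(const std::array<float, kLanes>& a,
                                    const std::array<float, kLanes>& out,
                                    int64_t tail_size) {
  std::array<float, kLanes> r = a; // padding lanes keep the accumulator
  for (int64_t i = 0; i < tail_size; i++) {
    r[i] = out[i]; // valid lanes take the combined value
  }
  return r;
}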
@@ -859,14 +892,16 @@ template <typename T, int NI, int NV>
 void atomic_add_vec(
     T* addr,
     at::vec::VectorizedN<int64_t, NI> index,
-    at::vec::VectorizedN<T, NV> offset) {
+    at::vec::VectorizedN<T, NV> offset,
+    std::optional<int64_t> tail_size = std::nullopt) {
   constexpr int len = at::vec::VectorizedN<int64_t, NI>::size();
   static_assert(len <= at::vec::VectorizedN<T, NV>::size());
   __at_align__ std::array<T, len> tmpbuf;
   __at_align__ std::array<int64_t, len> tmpidx;
   offset.store(tmpbuf.data(), len);
   index.store(tmpidx.data(), len);
-  for (int i = 0; i < len; i++) {
+  int size = tail_size.has_value() ? tail_size.value() : len;
+  for (int i = 0; i < size; i++) {
     atomic_add(addr + tmpidx[i], tmpbuf[i]);
   }
 }
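Note: with the optional tail_size argument, only the first tail_size gathered lanes are applied, which is what lets the tail loop reuse the vectorized scatter-add path. A standalone analogue, requiring C++20 for std::atomic_ref (`atomic_scatter_add` is a hypothetical name standing in for the atomic_add() helper used in the real header):

#include <atomic>
#include <cstdint>
#include <optional>

template <int kLen>
void atomic_scatter_add(float* addr, const int64_t (&idx)[kLen],
                        const float (&val)[kLen],
                        std::optional<int64_t> tail_size = std::nullopt) {
  int64_t n = tail_size.value_or(kLen); // apply only the valid lanes
  for (int64_t i = 0; i < n; i++) {
    std::atomic_ref<float>(addr[idx[i]]).fetch_add(val[i]);
  }
}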