silu-v1: Fix EPS not being used during max-reduction (#25069)

Signed-off-by: elvircrn <elvircrn@gmail.com>
2025-10-20 23:03:52 +08:00 · 2025-09-18 12:25:12 +02:00
parent 4f02b77de4
commit abdfcd4f3d
1 changed file with 1 addition and 2 deletions
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
  int32_t compute_pipeline_offset_64 = 0;

  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-    __nv_bfloat16 y_max_bf16 = EPS;
    __nv_bfloat162 results_bf162[2];

    cp_async_wait<NUM_STAGES - 2>();
@@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
    auto _y_max2 =
        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));

-    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));

    // An entire group is assigned to a single warp, so a simple warp reduce
    // is used.