Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
[Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382)
@@ -492,12 +492,14 @@ def fused_experts(hidden_states: torch.Tensor,
         if tokens_in_chunk == 0:
             break
 
-        if tokens_in_chunk < CHUNK_SIZE:
-            # will only happen in the last chunk
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
             intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
-            # reload config to get better performance on the last chunk
             config = get_config_func(tokens_in_chunk)
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
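For context, below is a minimal, self-contained sketch of the chunking pattern this guard applies to. It is not the vLLM implementation: the CHUNK_SIZE value, pick_config, and process_in_chunks are hypothetical stand-ins, and only the "tokens_in_chunk < CHUNK_SIZE and chunk > 0" condition mirrors the change above. The point it illustrates is that the intermediate buffers are allocated once for min(num_tokens, CHUNK_SIZE) tokens, so when the whole batch fits in a single chunk no narrowing or config re-query is needed.

# Minimal sketch (not the vLLM code) of chunked processing with the
# last-chunk guard. CHUNK_SIZE, pick_config and the "expert" compute
# are placeholders chosen only for illustration.
import torch

CHUNK_SIZE = 4  # tiny value for illustration; the real chunk size is much larger

def pick_config(num_tokens: int) -> dict:
    # Stand-in for get_config_func: pretend the kernel config depends on M.
    return {"BLOCK_SIZE_M": min(num_tokens, CHUNK_SIZE)}

def process_in_chunks(hidden_states: torch.Tensor) -> torch.Tensor:
    num_tokens, hidden = hidden_states.shape
    out = torch.empty_like(hidden_states)
    # Caches are sized for the first chunk, i.e. min(num_tokens, CHUNK_SIZE).
    cache = torch.empty(min(num_tokens, CHUNK_SIZE), hidden)
    config = pick_config(min(num_tokens, CHUNK_SIZE))

    for chunk in range(num_tokens // CHUNK_SIZE + 1):
        begin, end = chunk * CHUNK_SIZE, min((chunk + 1) * CHUNK_SIZE, num_tokens)
        curr = hidden_states[begin:end]
        tokens_in_chunk = curr.shape[0]
        if tokens_in_chunk == 0:
            break
        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
            # Only a trailing short chunk needs a smaller cache view and a
            # config re-query; with a single chunk they are already correct.
            cache = cache[:tokens_in_chunk]
            config = pick_config(tokens_in_chunk)
        cache[:tokens_in_chunk] = curr * 2.0  # placeholder "expert" compute
        out[begin:end] = cache[:tokens_in_chunk]
    return out

print(process_in_chunks(torch.randn(6, 3)).shape)  # torch.Size([6, 3])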