Update Flashinfer to 0.2.14.post1 (#23537)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-20 23:03:52 +08:00 · 2025-08-26 09:30:44 +08:00
parent 906e461ed6
commit ae067888d6
5 changed files with 14 additions and 7 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
--- a/setup.py
+++ b/setup.py
@@ -694,7 +694,7 @@ setup(
                  "mistral_common[audio]"],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.12"],
+        "flashinfer": ["flashinfer-python==0.2.14.post1"],
        # Optional deps for AMD FP4 quantization support
        "petit-kernel": ["petit-kernel"],
    },
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -465,7 +465,8 @@ if flashinfer_comm is not None:
                quant_out=quant_out,
                scale_out=scale_out,
                # in vllm we only support swizzled layout
-                layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
+                layout_code=flashinfer_comm.QuantizationSFLayout.
+                SWIZZLED_128x4,
                scale_factor=scale_factor,
            )
        else:
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -6,6 +6,7 @@ import torch
 from torch.nn.parameter import Parameter

 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                  FusedMoEMethodBase)
@@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
        self.topk_indices_dtype = None
        self.moe = moe
        self.use_marlin = self._should_use_marlin()
+        self.max_capture_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size

        if current_platform.is_device_capability(100) and not has_flashinfer():
            logger.warning_once(
@@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                x_scale = None
            else:
                x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
            trtllm_gen_output = trtllm_fp4_block_scale_moe(
                router_logits.to(torch.bfloat16),
                None,  # routing_bias
@@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                self._get_tile_tokens_dim(x, top_k),
                1 if renormalize else 0,  # routing_method_type, renormalize
                True,  # do finalize
+                tune_max_num_tokens=self.max_capture_size,
            )[0]
            return trtllm_gen_output
        else:
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -310,6 +310,10 @@ class Worker(WorkerBase):
            logger.info("Compile and warming up model for size %d", size)
            self.model_runner._dummy_run(size, skip_eplb=True)

+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model()

@@ -334,9 +338,6 @@ class Worker(WorkerBase):
                self.model_runner._dummy_sampler_run(
                    hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)