diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 28792338f0..09972e6b11 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -36,9 +36,9 @@ def kernel_warmup(worker: "Worker"):
         max_tokens = worker.scheduler_config.max_num_batched_tokens
         deep_gemm_warmup(model, max_tokens)
 
-    # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
-    if has_flashinfer() and current_platform.has_device_capability(90):
-        flashinfer_autotune(worker.model_runner)
+    # # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
+    # if has_flashinfer() and current_platform.has_device_capability(90):
+    #     flashinfer_autotune(worker.model_runner)
 
     # FlashInfer attention warmup
     # Only warmup if the model has FlashInfer attention groups