[Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfer (#20934)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
Pavani Majety
2025-07-14 20:27:50 -07:00
committed by GitHub
parent 016b8d1b7f
commit 9ad0a4588b

View File

@@ -1418,14 +1418,15 @@ class EngineArgs:
and not envs.is_set("VLLM_ATTENTION_BACKEND")
) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
supported = False
if current_platform.is_rocm():
if current_platform.is_rocm() or (
current_platform.is_cuda()
and current_platform.is_device_capability(100)):
supported = True
elif fp8_attention and will_use_fa:
from vllm.attention.utils.fa_utils import (
flash_attn_supports_fp8)
supported = flash_attn_supports_fp8()
elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
supported = True
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)