Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
[Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfer (#20934)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
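Context for the capability check in the diff below: vLLM encodes a device's compute capability (major, minor) as major * 10 + minor, so is_device_capability(100) targets compute capability 10.0, i.e. SM100 (Blackwell-class) GPUs. A quick way to see what a given GPU reports, using only the standard torch API (the encoding convention here is stated as an assumption about vLLM, not taken from this commit):

```python
# Quick check of the capability value referenced in the diff below;
# assumes vLLM's convention of encoding (major, minor) as
# major * 10 + minor, so 100 corresponds to compute capability 10.0
# (SM100 / Blackwell).
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"compute capability {major}.{minor} -> encoded as {major * 10 + minor}")
else:
    print("no CUDA device visible")
```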
```diff
@@ -1418,14 +1418,15 @@ class EngineArgs:
             and not envs.is_set("VLLM_ATTENTION_BACKEND")
         ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
         supported = False
-        if current_platform.is_rocm():
+        if current_platform.is_rocm() or (
+                current_platform.is_cuda()
+                and current_platform.is_device_capability(100)):
             supported = True
         elif fp8_attention and will_use_fa:
             from vllm.attention.utils.fa_utils import (
                 flash_attn_supports_fp8)
             supported = flash_attn_supports_fp8()
-        elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
-            supported = True
+
         if not supported:
             _raise_or_fallback(feature_name="--kv-cache-dtype",
                                recommend_to_remove=False)
```