Add warning for Attention backends that do not support irope yet (#16212)

Author: Yong Hoon Shin
Date: 2025-04-07 20:59:26 -07:00
Committed by: GitHub
Parent: ad971af8c7
Commit: 05a015d6a5
8 changed files with 52 additions and 0 deletions
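
Every backend touched here gets the same treatment: its constructor gains a use_irope: bool = False keyword, and when a caller asks for interleaved RoPE (irope) the backend logs a one-time warning that it will fall back to global attention for long context instead of erroring out. The sketch below shows that pattern in isolation; it is not vllm code. DummyAttentionImpl and _warn_irope_once are hypothetical names, and Python's standard logging plus functools.lru_cache stand in for the init_logger / logger.warning_once helpers used in the diff.

import functools
import logging

logger = logging.getLogger(__name__)


@functools.lru_cache(maxsize=None)
def _warn_irope_once(backend_name: str) -> None:
    # Stand-in for vllm's logger.warning_once: lru_cache turns repeated calls
    # with the same backend name into no-ops, so the warning fires only once.
    logger.warning(
        "Using irope in %s is not supported yet, it will fall back "
        "to global attention for long context.", backend_name)


class DummyAttentionImpl:
    """Hypothetical backend constructor following the pattern in this commit."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 scale: float,
                 use_irope: bool = False) -> None:
        if use_irope:
            # No local-attention kernel is available for irope layers yet, so
            # the backend degrades to global attention and tells the user once.
            _warn_irope_once("DummyAttentionImpl")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

Warning once rather than raising keeps existing models loading; the per-file hunks below simply apply this pattern to each backend that lacks irope support.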

View File

@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
from vllm.attention.layer import Attention
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.logger import init_logger
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                        make_tensor_with_pad)

logger = init_logger(__name__)

if TYPE_CHECKING:
    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                          ModelInputForGPUWithSamplingMetadata)
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            logger.warning_once(
                "Using irope in FlashInfer is not supported yet, it will fall"
                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

View File

@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        max_seq_len: int = 4096,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        super(AttentionImpl, self).__init__()
        if use_irope:
            logger.warning_once(
                "Using irope in HPU is not supported yet, it will fall back "
                "to global attention for long context.")
        self.kv_cache_dtype = kv_cache_dtype
        self.num_heads = num_heads
        self.head_size = head_size

View File

@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.paged_attn import (PagedAttention,
                                           PagedAttentionMetadata)
from vllm.logger import init_logger

logger = init_logger(__name__)

_PARTITION_SIZE = 512
@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            logger.warning_once(
                "Using irope in Ipex is not supported yet, it will fall"
                " back to global attention for long context.")
        if blocksparse_params is not None:
            raise ValueError(
                "IPEX backend does not support block-sparse attention.")

View File

@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadata, AttentionType,
                                              is_quantized_kv_cache)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.logger import init_logger

logger = init_logger(__name__)


class PallasAttentionBackend(AttentionBackend):
@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            logger.warning_once(
                "Using irope in Pallas is not supported yet, it will fall back "
                "to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

View File

@@ -462,7 +462,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            logger.warning_once(
                "Using irope in ROCm Flash Attention is not supported yet, it "
                "will fall back to global attention for long context.")
        if blocksparse_params is not None:
            raise ValueError(
                "ROCmFlashAttention does not support blocksparse attention.")

View File

@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
        if logits_soft_cap is not None:
            logger.warning_once("Torch SDPA does not support logits soft cap. "
                                "Outputs may be slightly off.")
        if use_irope:
            logger.warning_once(
                "Using irope in Torch SDPA is not supported yet, it will fall"
                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

View File

@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        if logits_soft_cap is not None:
            logger.warning_once("XFormers does not support logits soft cap. "
                                "Outputs may be slightly off.")
        if use_irope:
            logger.warning_once(
                "Using irope in XFormers is not supported yet, it will fall"
                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

View File

@@ -10,6 +10,9 @@ import torch_xla.experimental.custom_kernel  # noqa: F401
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer, AttentionType)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.logger import init_logger

logger = init_logger(__name__)


class PallasAttentionBackend(AttentionBackend):
@@ -80,7 +83,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
        blocksparse_params: Optional[dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        if use_irope:
            logger.warning_once(
                "Using irope in Pallas is not supported yet, it will fall back "
                "to global attention for long context.")
        if blocksparse_params is not None:
            raise ValueError("Paged attention Pallas kernel does "
                             "not support block-sparse attention.")