mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
Remove the block manager v1. This is the initial piece of the prefix-caching-centric design. To achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).
27 lines
890 B
Python
27 lines
890 B
Python
"""Block manager utils."""
|
|
from vllm.sequence import SequenceGroup
|
|
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
|
STR_NOT_IMPL_ENC_DEC_SWA)
|
|
|
|
|
|
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    """Reject encoder/decoder sequence groups that need unsupported features.

    Prefix caching and sliding-window attention (SWA) are currently
    unsupported *specifically* for encoder/decoder models, so a block
    manager configured with either one cannot serve such a sequence
    group. Sequence groups that are not encoder/decoder pass through
    without block_mgr being inspected.

    Arguments:

    * block_mgr: BlockSpaceManager instance; its
      `max_block_sliding_window` and `enable_caching` attributes are read
    * seq_group: SequenceGroup passed to block_mgr

    Raises:

    * NotImplementedError: if seq_group is encoder/decoder and block_mgr
      has a sliding window set (checked first) or has caching enabled
    """
    # Guard clause: the restrictions only concern encoder/decoder models.
    if not seq_group.is_encoder_decoder():
        return

    # A sliding window of None means SWA is not in use.
    uses_sliding_window = block_mgr.max_block_sliding_window is not None
    if uses_sliding_window:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
|