mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
[CI Failure] fix models/language/pooling/test_auto_prefix_cache_support.py (#24636)
Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
@ -3558,6 +3558,10 @@ class VllmConfig:
|
|||||||
disable_chunked_prefill_reasons.append(
|
disable_chunked_prefill_reasons.append(
|
||||||
"Only \"last\" pooling supports chunked "
|
"Only \"last\" pooling supports chunked "
|
||||||
"prefill and prefix caching; disabling both.")
|
"prefill and prefix caching; disabling both.")
|
||||||
|
if not getattr(self.model_config.hf_config, "is_causal", True):
|
||||||
|
disable_chunked_prefill_reasons.append(
|
||||||
|
"Only models using causal attention supports chunked "
|
||||||
|
"prefill and prefix caching; disabling both.")
|
||||||
elif self.model_config.is_encoder_decoder:
|
elif self.model_config.is_encoder_decoder:
|
||||||
self.scheduler_config.max_num_encoder_input_tokens = \
|
self.scheduler_config.max_num_encoder_input_tokens = \
|
||||||
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
||||||
|
Reference in New Issue
Block a user