Drop 0.10.2 (#3284)

Drop vLLM v0.10.2 support; vLLM v0.11.0rc3 is now the supported version.
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Committed: 2025-10-09 10:28:38 +08:00 (via GitHub)
Parent: 2dde1268c7
Commit: f12f76d7ba
17 changed files with 202 additions and 653 deletions
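Before the file diffs, a minimal sketch of the guard pattern this commit deletes across the tree. This is illustrative only; `legacy_api` and `current_api` are hypothetical stand-ins for the 0.10.2-era and 0.11.0-era call signatures, not functions from vLLM or vllm-ascend:

def vllm_version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is; assume vLLM 0.11.0 is installed.
    return version == "0.11.0"


def legacy_api(x: int) -> int:
    # Shape of a 0.10.2-era code path (deleted by this commit).
    return x + 1


def current_api(x: int) -> int:
    # Shape of the 0.11.0-era code path (kept).
    return x + 1


def before_this_commit(x: int) -> int:
    # Call sites used to branch on the installed vLLM version.
    if vllm_version_is("0.10.2"):
        return legacy_api(x)
    return current_api(x)


def after_this_commit(x: int) -> int:
    # The guard and the legacy branch are gone; only the 0.11.0 path remains.
    return current_api(x)


assert before_this_commit(1) == after_this_commit(1) == 2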

View File

@@ -42,8 +42,6 @@ from vllm.model_executor.models.qwen2_5_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
@@ -498,20 +496,12 @@ class AscendQwen2_5_VLForConditionalGeneration(
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:

View File

@@ -68,7 +68,6 @@ from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is


 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -484,20 +483,12 @@ class AscendQwen2_5_VLForConditionalGeneration_Without_Padding(
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
@@ -563,20 +554,12 @@ class AscendQwen3VLForConditionalGeneration(Qwen3VLForConditionalGeneration):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen3VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel)


 @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
@@ -613,19 +596,10 @@ class AscendQwen3VLMoeForConditionalGeneration(
         multimodal_config = vllm_config.model_config.multimodal_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
+        )

View File

@@ -40,8 +40,6 @@ from vllm.model_executor.models.qwen2_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
@@ -345,18 +343,9 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(
-                    vllm_config.quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=vllm_config.quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2VisionTransformer(
+            self.config.vision_config,
+            norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
+            quant_config=vllm_config.quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )

View File

@@ -47,7 +47,6 @@ from vllm.model_executor.models.utils import (
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)

 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.utils import vllm_version_is


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -170,14 +169,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,

View File

@@ -43,8 +43,7 @@ from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)


 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -275,25 +274,14 @@ class AscendFusedMoE(FusedMoE):
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )

         self.moe_config = moe
         # TODO: The self.moe_config.tp_size here is not correct, fixme soon

View File

@@ -26,8 +26,6 @@ from vllm.distributed.parallel_state import (
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig

-from vllm_ascend.utils import vllm_version_is
-

 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -416,12 +414,8 @@ class FusedMoEPrepareAndFinalizeWithNaiveMulticast(FusedMoEPrepareAndFinalize):
         self.enable_shared_expert_dp = enable_shared_expert_dp

         if self.moe_config.dp_size > 1:
-            if vllm_version_is("0.10.2"):
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            self.cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self._naive_multicast(hidden_states,
                                                   self.cu_tokens_across_dp_cpu)
             if rm_router_logits:

View File

@@ -16,8 +16,6 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform

-from vllm_ascend.utils import vllm_version_is
-

 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
     """Attention layer.
@@ -69,12 +67,10 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
-            is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
@@ -135,23 +131,13 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend

View File

@@ -27,154 +27,72 @@ from vllm.attention.selector import (backend_name_to_enum,
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import resolve_obj_by_qualname

-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.10.2"):
-
-    def get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            is_attention_free=is_attention_free,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        is_attention_free: bool,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # If there are no attention layers (e.g. we are running Mamba),
-        # use the placeholder NO_ATTENTION
-        if is_attention_free:
-            from vllm.attention.backends.placeholder_attn import \
-                PlaceholderAttentionBackend
-            return PlaceholderAttentionBackend
-
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
-else:
-
-    def get_attn_backend(  # type: ignore[misc]
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-        head_size: int,
-        dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
-        block_size: int,
-        use_v1: bool = False,
-        use_mla: bool = False,
-        use_sfa: bool = False,
-        has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
+
+def get_attn_backend(  # type: ignore[misc]
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    """Selects which attention backend to use and lazily imports it."""
+    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
+    # value to be returned from the cache if the value changes between calls.
+    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+    # private function.
+    return _cached_get_attn_backend(
+        head_size=head_size,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        block_size=block_size,
+        use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
+        use_sfa=use_sfa,
+        has_sink=has_sink,
+    )
+
+
+@cache
+def _cached_get_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_v1: bool = False,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    # Check whether a particular choice of backend was
+    # previously forced.
+    #
+    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
+    # ENVIRONMENT VARIABLE.
+    selected_backend = None
+    backend_by_global_setting: Optional[_Backend] = (
+        get_global_forced_attn_backend())
+    if backend_by_global_setting is not None:
+        selected_backend = backend_by_global_setting
+    else:
+        # Check the environment variable and override if specified
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+            if selected_backend is None:
+                raise ValueError(
+                    f"Invalid attention backend: '{backend_by_env_var}'. "
+                    f"Valid backends are: {list(_Backend.__members__.keys())}")
+
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla, use_sfa, has_sink)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)


 vllm.attention.get_attn_backend = get_attn_backend

View File

@@ -1,11 +1,10 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes

-from vllm_ascend.utils import vllm_version_is
-
 logger = init_logger(__name__)
@@ -39,6 +38,4 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
     set_weight_attrs(weight, extra_weight_attrs)


-if not vllm_version_is("0.10.2"):
-    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-    UnquantizedLinearMethod.create_weights = create_weights
+UnquantizedLinearMethod.create_weights = create_weights

View File

@@ -32,7 +32,7 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes)

 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -131,10 +131,7 @@ class NPUPlatform(Platform):
         cache_config = vllm_config.cache_config
         scheduler_config = vllm_config.scheduler_config
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
-        if vllm_version_is("0.10.2"):
-            structured_outputs_config = vllm_config.decoding_config
-        else:
-            structured_outputs_config = vllm_config.structured_outputs_config
+        structured_outputs_config = vllm_config.structured_outputs_config

         if (model_config is not None and not model_config.use_mla
                 and not scheduler_config.async_scheduling):
@@ -212,9 +209,8 @@ class NPUPlatform(Platform):
             vllm_config._set_cudagraph_sizes()

         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
-        if not vllm_version_is("0.10.2"):
-            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
-                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+        if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION

View File

@@ -3,13 +3,9 @@ import torch_npu
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
 from vllm.v1.sample.sampler import Sampler

-from vllm_ascend.utils import is_310p, vllm_version_is
+from vllm_ascend.utils import is_310p

-if vllm_version_is("0.10.2"):
-    from vllm.config import LogprobsMode
-    DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
-else:
-    DEFAULT_LOGPROBS_MODE = "raw_logprobs"
+DEFAULT_LOGPROBS_MODE = "raw_logprobs"


 class AscendSampler(Sampler):
@@ -69,18 +65,10 @@ class AscendTopKTopPSampler(TopKTopPSampler):
         """Override pytorch native implementation to torch_npu"""
         logits = self._apply_top_k_top_p(logits, k, p)
         logits_to_return = None
-        if vllm_version_is("0.10.2"):
-            if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
-                logits_to_return = logits
-            elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
-        else:
-            if self.logprobs_mode == "processed_logits":
-                logits_to_return = logits
-            elif self.logprobs_mode == "processed_logprobs":
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
+        if self.logprobs_mode == "processed_logits":
+            logits_to_return = logits
+        elif self.logprobs_mode == "processed_logprobs":
+            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)

         probs = logits.softmax(dim=-1, dtype=torch.float32)
         return random_sample(probs, generators), logits_to_return

View File

@@ -21,7 +21,6 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
-from vllm_ascend.utils import vllm_version_is

 PADDING_SLOT_ID = -1
@@ -352,10 +351,7 @@ class EagleProposer(Proposer):
                 decode_token_per_req=self.runner.decode_token_per_req,
                 num_computed_tokens_cpu=None,
                 seq_lens=None)
-            if vllm_version_is("0.10.2"):
-                builder = self.runner.attn_groups[0][0].metadata_builder
-            else:
-                builder = self.runner.attn_groups[0][0].get_metadata_builder()
+            builder = self.runner.attn_groups[0][0].get_metadata_builder()
             attn_metadata_i = builder.build(0, common_attn_metadata,
                                             self.runner.get_model())
             for layer_name in kv_cache_group_spec.layer_names:
@@ -447,10 +443,7 @@ class EagleProposer(Proposer):
             num_computed_tokens_cpu=None,
             seq_lens=None)
         # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        if vllm_version_is("0.10.2"):
-            builder = self.runner.attn_groups[0][0].metadata_builder
-        else:
-            builder = self.runner.attn_groups[0][0].get_metadata_builder()
+        builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata = builder.build(0, common_attn_metadata,
                                       self.runner.get_model())
         if self.use_cuda_graph and \
@@ -479,10 +472,7 @@ class EagleProposer(Proposer):
             hidden_states=self.hidden_states[:num_input_tokens],
         )
         sample_hidden_states = last_hidden_states[last_token_indices]
-        if vllm_version_is("0.10.2"):
-            logits = self.model.compute_logits(sample_hidden_states, None)
-        else:
-            logits = self.model.compute_logits(sample_hidden_states)
+        logits = self.model.compute_logits(sample_hidden_states)
         draft_token_ids = logits.argmax(dim=-1)

         # Early exit if there is only one draft token to be generated.
@@ -586,12 +576,7 @@ class EagleProposer(Proposer):
                 hidden_states=self.hidden_states[:input_batch_size],
             )
             hidden_states = hidden_states[:batch_size]
-            if vllm_version_is("0.10.2"):
-                logits = self.model.compute_logits(
-                    last_hidden_states[:batch_size], None)
-            else:
-                logits = self.model.compute_logits(
-                    last_hidden_states[:batch_size])
+            logits = self.model.compute_logits(last_hidden_states[:batch_size])

             # TODO(wenlong): get more than one token for tree attention
             draft_token_ids = logits.argmax(dim=-1)

View File

@@ -24,8 +24,7 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
-                               vllm_version_is)
+from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable

 PADDING_SLOT_ID = -1
@@ -400,10 +399,7 @@ class MtpProposer(Proposer):
             seq_lens=None)
         if not self.torchair_graph_enabled:
-            if vllm_version_is("0.10.2"):
-                builder = self.runner.attn_groups[0][0].metadata_builder
-            else:
-                builder = self.runner.attn_groups[0][0].get_metadata_builder()
+            builder = self.runner.attn_groups[0][0].get_metadata_builder()
             attn_metadata_mtp = builder.build(0, common_attn_metadata,
                                               self.runner.get_model())

View File

@@ -56,7 +56,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
                                                         init_metadata_for_sp)
-from vllm_ascend.utils import vllm_version_is


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -312,14 +311,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,

View File

@@ -50,8 +50,7 @@ from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                                get_all_reduce_merge_state,
                                get_ascend_soc_version,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)


 def torchair_fused_experts_with_mc2(
@@ -1061,26 +1060,14 @@ class TorchairAscendFusedMoE(FusedMoE):
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-
-        if vllm_version_is("0.10.2"):
-            self.moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            self.moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        self.moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         if quant_config is None:
             self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
                 self.moe)
@@ -1242,12 +1229,8 @@ class TorchairAscendFusedMoE(FusedMoE):
                 router_logits = get_dp_group().all_gather(router_logits, 0)
             elif fused_moe_state == FusedMoEState.NaiveMulticast:
-                if vllm_version_is("0.10.2"):
-                    cu_tokens_across_dp_cpu = get_forward_context(
-                    ).dp_metadata.cu_tokens_across_dp_cpu
-                else:
-                    cu_tokens_across_dp_cpu = get_forward_context(
-                    ).dp_metadata.cu_tokens_across_sp(1)
+                cu_tokens_across_dp_cpu = get_forward_context(
+                ).dp_metadata.cu_tokens_across_sp(1)
                 hidden_states = self.naive_multicast(hidden_states,
                                                      cu_tokens_across_dp_cpu)
                 if self.rm_router_logits:

View File

@@ -78,10 +78,12 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 # yapf: disable
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                         KVCacheConfig, KVCacheGroupSpec,
-                                        KVCacheSpec, MambaSpec)
+                                        KVCacheSpec, MambaSpec,
+                                        UniformTypeKVCacheSpecs)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
+                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
+                             PoolerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -121,7 +123,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                get_ascend_soc_version, is_310p,
-                               lmhead_tp_enable, vllm_version_is)
+                               lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

 if TYPE_CHECKING:
@@ -143,13 +145,6 @@ if is_310p():
 else:
     ACL_FORMAT = ACL_FORMAT_FRACTAL_ND

-if not vllm_version_is("0.10.2"):
-    from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
-    from vllm.v1.outputs import PoolerOutput
-else:
-    from vllm.sequence import PoolerOutput
-    UniformTypeKVCacheSpecs = None
-

 @dataclass
 class GraphCaptureContext:
@@ -308,23 +303,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                          dtype=self.dtype,
                                          device=self.device)
         # Set up Attention
-        if vllm_version_is("0.10.2"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                self.model_config.is_attention_free,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sfa=self.ascend_config.use_sfa)
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
@@ -602,12 +587,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 to_update.apply(pooling_params)

             backward_kwargs = {}
-            if vllm_version_is("0.10.2"):
-                backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
-                backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
-                backward_kwargs["mm_positions"] = new_req_data.mm_positions
-            else:
-                backward_kwargs["mm_features"] = new_req_data.mm_features
+            backward_kwargs["mm_features"] = new_req_data.mm_features

             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
@@ -624,10 +604,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             if self.uses_mrope:
-                if vllm_version_is("0.10.2"):
-                    self._init_mrope_positions_0102(self.requests[req_id])
-                else:
-                    self._init_mrope_positions(self.requests[req_id])
+                self._init_mrope_positions(self.requests[req_id])

             req_ids_to_add.append(req_id)
@@ -759,39 +736,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 use_audio_in_video=use_audio_in_video,
             )

-    def _init_mrope_positions_0102(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        assert req_state.mm_kwargs is not None
-        for mm_item in req_state.mm_kwargs:
-            mm_input = mm_item.get_data()
-            if mm_input.get("image_grid_thw") is not None:
-                image_grid_thw.append(mm_input["image_grid_thw"].tolist())
-            if mm_input.get("video_grid_thw") is not None:
-                video_grid_thw.append(mm_input["video_grid_thw"].tolist())
-            if mm_input.get("second_per_grid_ts") is not None:
-                second_per_grid_ts.append(mm_input["second_per_grid_ts"])
-            if mm_input.get("audio_feature_lengths") is not None:
-                audio_feature_lengths.append(mm_input["audio_feature_lengths"])
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        hf_config = self.model_config.hf_config
-
-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-
     def _sync_metadata_across_dp(
             self, num_tokens: int, with_prefill: bool, enable_dbo: bool
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -966,12 +910,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             return

         # Batch the multi-modal inputs.
-        if vllm_version_is("0.10.2"):
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
-                scheduler_output)
-        else:
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
-                scheduler_output)
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)

         encoder_outputs = []
         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
@@ -1003,31 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 is_embed=pos_info.is_embed,
             )

-    # TODO: remove this once we drop support for vLLM 0.10.2
-    def _batch_mm_kwargs_from_scheduler_0102(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return [], []
-        # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
-        # list of tuple (mm_hash, position_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-            assert req_state.mm_hashes is not None
-            assert req_state.mm_kwargs is not None
-            assert req_state.mm_positions is not None
-            for mm_input_id in encoder_input_ids:
-                mm_hash = req_state.mm_hashes[mm_input_id]
-                mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
-                mm_hashes_pos.append(
-                    (mm_hash, req_state.mm_positions[mm_input_id]))
-
-        return mm_kwargs, mm_hashes_pos
-
     def _batch_mm_kwargs_from_scheduler(
         self,
         scheduler_output: "SchedulerOutput",
@@ -1067,20 +982,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     ) -> list[torch.Tensor]:

         def _iter_mm_features(req_state: CachedRequestState):
-            if vllm_version_is("0.10.2"):
-                # legacy path (to be removed later)
-                assert req_state.mm_hashes is not None
-                assert req_state.mm_positions is not None
-                for mm_hash, pos_info in zip(req_state.mm_hashes,
-                                             req_state.mm_positions):
-                    yield mm_hash, pos_info, getattr(pos_info, "is_embed",
-                                                     None)
-            else:
-                assert req_state.mm_features is not None
-                for mm_feature in req_state.mm_features:
-                    pos_info = mm_feature.mm_position
-                    yield mm_feature.identifier, pos_info, getattr(
-                        pos_info, "is_embed", None)
+            assert req_state.mm_features is not None
+            for mm_feature in req_state.mm_features:
+                pos_info = mm_feature.mm_position
+                yield mm_feature.identifier, pos_info, getattr(
+                    pos_info, "is_embed", None)

         mm_embeds: list[torch.Tensor] = []
@@ -1527,10 +1433,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             for attn_group in self.attn_groups[kv_cache_group_id]:
                 common_prefix_len = 0
                 extra_attn_metadata_args = {}
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 if isinstance(builder, GDNAttentionMetadataBuilder):
                     if use_spec_decode:
                         extra_attn_metadata_args = dict(
@@ -1809,29 +1712,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                       device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]

-        if vllm_version_is("0.10.2"):
-            # Pooling models D2H & synchronize occurs in pooler.py:build_output
-            raw_pooler_output = self.model.pooler(
-                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            raw_pooler_output = model.pooler(
-                hidden_states=hidden_states,
-                pooling_metadata=pooling_metadata,
-            )
-            raw_pooler_output = json_map_leaves(
-                lambda x: x.to("cpu", non_blocking=True),
-                raw_pooler_output,
-            )
-            torch.npu.synchronize()
+        model = cast(VllmModelForPooling, self.model)
+        raw_pooler_output = model.pooler(
+            hidden_states=hidden_states,
+            pooling_metadata=pooling_metadata,
+        )
+        raw_pooler_output = json_map_leaves(
+            lambda x: x.to("cpu", non_blocking=True),
+            raw_pooler_output,
+        )
+        torch.npu.synchronize()

         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-            if vllm_version_is("0.10.2"):
-                output = raw_output.data if seq_len == prompt_len else None
-            else:
-                output = raw_output if seq_len == prompt_len else None
+            output = raw_output if seq_len == prompt_len else None
             pooler_output.append(output)

         return ModelRunnerOutput(
@@ -2006,8 +1901,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     num_scheduled_tokens_np, finished_sending,
                     finished_recving, kv_connector_output)
             sample_hidden_states = hidden_states[logits_indices]
-            logits = self._compute_logits_wrapper(sample_hidden_states,
-                                                  None)
+            logits = self.model.compute_logits(sample_hidden_states)
             if broadcast_pp_output:
                 model_output_broadcast_data = {
                     "logits": logits.contiguous(),
@@ -2302,10 +2196,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             )

             for attn_group in self.attn_groups[kv_cache_group_id]:
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 attn_metadata_i = builder.build_for_graph_capture(
                     common_attn_metadata)
                 for layer_name in kv_cache_group_spec.layer_names:
@@ -2463,8 +2354,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                          dtype=torch.int32)

             def dummy_compute_logits(hidden_states):
-                return self._compute_logits_wrapper(
-                    hidden_states[dummy_indices], None)
+                return self.model.compute_logits(
+                    hidden_states[dummy_indices])

             with set_ascend_forward_context(
                     attn_metadata,
@@ -2542,18 +2433,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             logit_indices = np.cumsum(num_scheduled_tokens) - 1
             # TODO: need to rum a dummy sampler for generate task
             hidden_states = hidden_states[logit_indices]
-            output = self._compute_logits_wrapper(hidden_states, None)
+            output = self.model.compute_logits(hidden_states)
         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()

-    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
-        if vllm_version_is("0.10.2"):
-            return self.model.compute_logits(hidden_states, sampling_metadata)
-        return self.model.compute_logits(hidden_states)
-
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
@@ -2615,10 +2501,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            if vllm_version_is("0.10.2"):
-                output_size[task] = output.get_data_nbytes()
-            else:
-                output_size[task] = sum(o.nbytes for o in output)
+            output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC

         max_task = max(output_size.items(), key=lambda x: x[1])[0]
@@ -2657,16 +2540,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     self.model.get_eagle3_aux_hidden_state_layers())

             if self.lora_config:
-                if vllm_version_is("0.10.2"):
-                    self.model = self.load_lora_model(self.model,
-                                                      self.model_config,
-                                                      self.scheduler_config,
-                                                      self.lora_config,
-                                                      self.device)
-                else:
-                    self.model = self.load_lora_model(self.model,
-                                                      self.vllm_config,
-                                                      self.device)
+                self.model = self.load_lora_model(self.model, self.vllm_config,
+                                                  self.device)
             logger.info("Loading model weights took %.4f GB",
                         m.consumed_memory / float(2**30))
@@ -2694,17 +2569,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.initialize_attn_backend(kv_cache_config)
         self.use_hybrid_blocks = (len(self.attn_groups) > 1)
         # NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
-        if vllm_version_is("0.10.2"):
-            self.need_accepted_tokens = any([
-                isinstance(
-                    self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
-                    MambaSpec) for attn_group in self.attn_groups
-            ])
-        else:
-            self.need_accepted_tokens = any([
-                isinstance(attn_group[0].kv_cache_spec, MambaSpec)
-                for attn_group in self.attn_groups
-            ])
+        self.need_accepted_tokens = any([
+            isinstance(attn_group[0].kv_cache_spec, MambaSpec)
+            for attn_group in self.attn_groups
+        ])

         self.may_reinitialize_input_batch(kv_cache_config)
@@ -2737,11 +2605,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2846,11 +2711,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2996,11 +2858,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         )), "Some layers are not correctly initialized"
         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -3211,50 +3070,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 for k, v in attn_backend_layers.items()
             }

-        def get_attn_backends_for_layers(
-                layer_names: list[str]
-        ) -> dict[type[AttentionBackend], list[str]]:
-            """Get attention_backend for all attention layers
-            TODO: Only used in v0.10.2, drop me when 0.10.2 is dropped
-            """
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
-            attn_backends = {}
-            attn_backend_layers = defaultdict(list)
-            # Dedupe based on full class name; this is a bit safer than
-            # using the class itself as the key because when we create dynamic
-            # attention backend subclasses (e.g. ChunkedLocalAttention) unless
-            # they are cached correctly, there will be different objects per
-            # layer.
-            for layer_name in layer_names:
-                attn_backend = layers[layer_name].get_attn_backend()
-                key = attn_backend.full_cls_name()
-                attn_backends[key] = attn_backend
-                attn_backend_layers[key].append(layer_name)
-            return {
-                attn_backends[k]: v
-                for k, v in attn_backend_layers.items()
-            }
-
-        def create_attn_groups_v0102(
-            attn_backends_map: dict[AttentionBackend, list[str]],
-            kv_cache_spec: KVCacheSpec,
-        ) -> list[AttentionGroup]:
-            attn_groups: list[AttentionGroup] = []
-            for attn_backend, layer_names in attn_backends_map.items():
-                attn_metadata_builder_i = attn_backend.get_builder_cls()(
-                    kv_cache_spec,
-                    layer_names,
-                    self.vllm_config,
-                    self.device,
-                )
-                attn_group = AttentionGroup(attn_backend,
-                                            attn_metadata_builder_i,
-                                            layer_names)
-                attn_groups.append(attn_group)
-            return attn_groups
-
         def create_attn_groups(
             attn_backends_map: dict[AttentionBackend, list[str]],
         ) -> list[AttentionGroup]:
@@ -3274,18 +3089,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 attn_groups.append(attn_group)
             return attn_groups

-        if vllm_version_is("0.10.2"):
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                kv_cache_spec = kv_cache_group_spec.kv_cache_spec
-                attn_backends = get_attn_backends_for_layers(
-                    kv_cache_group_spec.layer_names)
-                self.attn_groups.append(
-                    create_attn_groups_v0102(attn_backends, kv_cache_spec))
-        else:
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                attn_backends = get_attn_backends_for_group(  # type: ignore
-                    kv_cache_group_spec)
-                self.attn_groups.append(create_attn_groups(attn_backends))
+        for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
+            attn_backends = get_attn_backends_for_group(  # type: ignore
+                kv_cache_group_spec)
+            self.attn_groups.append(create_attn_groups(attn_backends))

         # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()
@@ -3299,31 +3106,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for attn_groups in self.attn_groups:
             yield from attn_groups

-    def _kv_cache_spec_attn_group_iterator_v0102(
-            self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
-        if not self.kv_cache_config.kv_cache_groups:
-            return
-        for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
-            for attn_group in attn_groups:
-                yield self.kv_cache_config.kv_cache_groups[
-                    kv_cache_spec_id].kv_cache_spec, attn_group
-
-    def _kv_cache_spec_attn_group_iterator_dispatcher(self):
-        if vllm_version_is("0.10.2"):
-            return self._kv_cache_spec_attn_group_iterator_v0102()
-        else:
-            return self._kv_cache_spec_attn_group_iterator()
-
     def calculate_reorder_batch_threshold(self) -> None:
         """
         Check that if any backends reorder batches; that the reordering
         is compatible (e.g., decode threshold is the same)
         """
         for group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                attn_metadata_builder_i = group.metadata_builder
-            else:
-                attn_metadata_builder_i = group.get_metadata_builder()
+            attn_metadata_builder_i = group.get_metadata_builder()
             if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
                 # check that if any backends reorder batches; that the reordering
                 # is compatible (e.g., decode threshold is the same)
@@ -3427,10 +3216,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         min_ag_builder_name = None

         for attn_group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if builder.aclgraph_support.value < min_ag_support.value:
                 min_ag_support = builder.aclgraph_support
                 min_ag_builder_name = builder.__class__.__name__
@@ -3674,7 +3460,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc_np[req_idx].item()
             prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self._compute_logits_wrapper(prompt_hidden_states, None)
+            logits = self.model.compute_logits(prompt_hidden_states)

             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want

View File

@@ -39,7 +39,6 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice

-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import MultiGroupBlockTable
@@ -79,12 +78,6 @@ class CachedRequestState:
     @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
                 "removed in v0.13. Please use `mm_kwargs` instead.")
     def mm_inputs(self) -> list[MultiModalKwargsItems]:
-        if vllm_version_is("0.10.2"):
-            assert self.mm_kwargs is not None
-            return [
-                MultiModalKwargsItems.from_seq([item])
-                for item in self.mm_kwargs
-            ]
         assert self.mm_features is not None
         return [
             MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features