Mirror of https://github.com/vllm-project/vllm-ascend.git, synced 2025-10-20 13:43:53 +08:00
Drop 0.10.2 (#3284)
Drop v0.10.2 support; vLLM 0.11.0rc3 is now the supported version.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
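Every hunk in this commit has the same shape: a runtime branch on vllm_version_is("0.10.2") is deleted and only the vLLM 0.11.0 code path survives. The sketch below is illustrative only and is not part of the diff; vllm_version_is lives in vllm_ascend/utils.py, and the one-line body given for it here is an assumption, not the project's actual implementation.

from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # Assumed behaviour: True when the installed vLLM release equals `target`.
    return version("vllm") == target


def pick_code_path() -> str:
    # Shape of the code being deleted: one branch per supported vLLM release.
    if vllm_version_is("0.10.2"):
        return "legacy 0.10.2 path"
    return "0.11.0 path"


def pick_code_path_after_drop() -> str:
    # Shape of the code that remains: a single, unconditional 0.11.0 path.
    return "0.11.0 path"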
@@ -42,8 +42,6 @@ from vllm.model_executor.models.qwen2_5_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -498,20 +496,12 @@ class AscendQwen2_5_VLForConditionalGeneration(
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
 
@@ -68,7 +68,6 @@ from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
@@ -484,20 +483,12 @@ class AscendQwen2_5_VLForConditionalGeneration_Without_Padding(
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
-                vision_config=config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2_5_VisionTransformer_Without_Padding(
+            vision_config=config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
 
     def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
 
@@ -563,20 +554,12 @@ class AscendQwen3VLForConditionalGeneration(Qwen3VLForConditionalGeneration):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         config: Qwen3VLConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel)
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel)
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
@@ -613,19 +596,10 @@ class AscendQwen3VLMoeForConditionalGeneration(
         multimodal_config = vllm_config.model_config.multimodal_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
-        else:
-            self.visual = AscendQwen3_VisionTransformer(
-                config.vision_config,
-                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-                use_data_parallel=self.use_data_parallel,
-            )
+        self.visual = AscendQwen3_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
+        )
@@ -40,8 +40,6 @@ from vllm.model_executor.models.qwen2_vl import (
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-from vllm_ascend.utils import vllm_version_is
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -345,18 +343,9 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
-        if vllm_version_is("0.10.2"):
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=self._maybe_ignore_quant_config(
-                    vllm_config.quant_config),
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-        else:
-            self.visual = AscendQwen2VisionTransformer(
-                self.config.vision_config,
-                norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=vllm_config.quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
+        self.visual = AscendQwen2VisionTransformer(
+            self.config.vision_config,
+            norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
+            quant_config=vllm_config.quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+        )
@@ -47,7 +47,6 @@ from vllm.model_executor.models.utils import (
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
 
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.utils import vllm_version_is
 
 
 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -170,14 +169,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
@@ -43,8 +43,7 @@ from vllm_ascend.ops.moe.experts_selector import select_experts
 from vllm_ascend.ops.moe.moe_comm_method import setup_moe_comm_method
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
                                get_all_reduce_merge_state,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)
 
 
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
@@ -275,25 +274,14 @@ class AscendFusedMoE(FusedMoE):
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
+        moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         self.moe_config = moe
         # TODO: The self.moe_config.tp_size here is not correct, fixme soon
 
@@ -26,8 +26,6 @@ from vllm.distributed.parallel_state import (
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe import FusedMoEConfig
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -416,12 +414,8 @@ class FusedMoEPrepareAndFinalizeWithNaiveMulticast(FusedMoEPrepareAndFinalize):
         self.enable_shared_expert_dp = enable_shared_expert_dp
 
         if self.moe_config.dp_size > 1:
-            if vllm_version_is("0.10.2"):
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                self.cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            self.cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self._naive_multicast(hidden_states,
                                                   self.cu_tokens_across_dp_cpu)
             if rm_router_logits:
@@ -16,8 +16,6 @@ from vllm.model_executor.layers.quantization.base_config import \
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendAttention(Attention, nn.Module, AttentionLayerBase):
     """Attention layer.
@@ -69,12 +67,10 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
-            is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
@@ -135,23 +131,13 @@ class AscendAttention(Attention, nn.Module, AttentionLayerBase):
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         if attn_backend is None:
-            if vllm_version_is("0.10.2"):
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     is_attention_free,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
-            else:
-                self.attn_backend = get_attn_backend(head_size,
-                                                     dtype,
-                                                     kv_cache_dtype,
-                                                     block_size,
-                                                     use_mla=use_mla,
-                                                     use_sfa=use_sfa,
-                                                     has_sink=self.has_sink)
+            self.attn_backend = get_attn_backend(head_size,
+                                                 dtype,
+                                                 kv_cache_dtype,
+                                                 block_size,
+                                                 use_mla=use_mla,
+                                                 use_sfa=use_sfa,
+                                                 has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend
 
@@ -27,154 +27,72 @@ from vllm.attention.selector import (backend_name_to_enum,
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import resolve_obj_by_qualname
 
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.10.2"):
-
-    def get_attn_backend(
-            head_size: int,
-            dtype: torch.dtype,
-            kv_cache_dtype: Optional[str],
-            block_size: int,
-            is_attention_free: bool = False,
-            use_mla: bool = False,
-            use_sfa: bool = False,
-            has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            is_attention_free=is_attention_free,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-            head_size: int,
-            dtype: torch.dtype,
-            kv_cache_dtype: Optional[str],
-            block_size: int,
-            is_attention_free: bool,
-            use_v1: bool = False,
-            use_mla: bool = False,
-            use_sfa: bool = False,
-            has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # If there are no attention layers (e.g. we are running Mamba),
-        # use the placeholder NO_ATTENTION
-        if is_attention_free:
-            from vllm.attention.backends.placeholder_attn import \
-                PlaceholderAttentionBackend
-            return PlaceholderAttentionBackend
-
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
-else:
-
-    def get_attn_backend(  # type: ignore[misc]
-            head_size: int,
-            dtype: torch.dtype,
-            kv_cache_dtype: Optional[str],
-            block_size: int,
-            use_mla: bool = False,
-            use_sfa: bool = False,
-            has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        """Selects which attention backend to use and lazily imports it."""
-        # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-        # value to be returned from the cache if the value changes between calls.
-        # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-        # private function.
-        return _cached_get_attn_backend(
-            head_size=head_size,
-            dtype=dtype,
-            kv_cache_dtype=kv_cache_dtype,
-            block_size=block_size,
-            use_v1=envs.VLLM_USE_V1,
-            use_mla=use_mla,
-            use_sfa=use_sfa,
-            has_sink=has_sink,
-        )
-
-    @cache
-    def _cached_get_attn_backend(
-            head_size: int,
-            dtype: torch.dtype,
-            kv_cache_dtype: Optional[str],
-            block_size: int,
-            use_v1: bool = False,
-            use_mla: bool = False,
-            use_sfa: bool = False,
-            has_sink: bool = False,
-    ) -> type[AttentionBackend]:
-        # Check whether a particular choice of backend was
-        # previously forced.
-        #
-        # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
-        # ENVIRONMENT VARIABLE.
-        selected_backend = None
-        backend_by_global_setting: Optional[_Backend] = (
-            get_global_forced_attn_backend())
-        if backend_by_global_setting is not None:
-            selected_backend = backend_by_global_setting
-        else:
-            # Check the environment variable and override if specified
-            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-            if backend_by_env_var is not None:
-                selected_backend = backend_name_to_enum(backend_by_env_var)
-                if selected_backend is None:
-                    raise ValueError(
-                        f"Invalid attention backend: '{backend_by_env_var}'. "
-                        f"Valid backends are: {list(_Backend.__members__.keys())}"
-                    )
-
-        # get device-specific attn_backend
-        attention_cls = current_platform.get_attn_backend_cls(
-            selected_backend, head_size, dtype, kv_cache_dtype, block_size,
-            use_v1, use_mla, use_sfa, has_sink)
-        if not attention_cls:
-            raise ValueError(
-                f"Invalid attention backend for {current_platform.device_name}"
-            )
-        return resolve_obj_by_qualname(attention_cls)
+def get_attn_backend(  # type: ignore[misc]
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    """Selects which attention backend to use and lazily imports it."""
+    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
+    # value to be returned from the cache if the value changes between calls.
+    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
+    # private function.
+    return _cached_get_attn_backend(
+        head_size=head_size,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        block_size=block_size,
+        use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
+        use_sfa=use_sfa,
+        has_sink=has_sink,
+    )
+
+
+@cache
+def _cached_get_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: Optional[str],
+    block_size: int,
+    use_v1: bool = False,
+    use_mla: bool = False,
+    use_sfa: bool = False,
+    has_sink: bool = False,
+) -> type[AttentionBackend]:
+    # Check whether a particular choice of backend was
+    # previously forced.
+    #
+    # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
+    # ENVIRONMENT VARIABLE.
+    selected_backend = None
+    backend_by_global_setting: Optional[_Backend] = (
+        get_global_forced_attn_backend())
+    if backend_by_global_setting is not None:
+        selected_backend = backend_by_global_setting
+    else:
+        # Check the environment variable and override if specified
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+            if selected_backend is None:
+                raise ValueError(
+                    f"Invalid attention backend: '{backend_by_env_var}'. "
+                    f"Valid backends are: {list(_Backend.__members__.keys())}")
+
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla, use_sfa, has_sink)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)
 
 
 vllm.attention.get_attn_backend = get_attn_backend
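With only one override left, get_attn_backend no longer takes the is_attention_free argument that the 0.10.2 branch still forwarded. A hedged usage sketch (not part of the diff): the keyword names follow the signature kept above, the concrete values are placeholders, and it assumes the vllm-ascend patch module has already been imported so that vllm.attention.get_attn_backend points at this override.

import torch
import vllm.attention

backend_cls = vllm.attention.get_attn_backend(
    head_size=128,
    dtype=torch.bfloat16,
    kv_cache_dtype=None,
    block_size=128,
    use_mla=False,  # is_attention_free is gone; these are the remaining flags
    use_sfa=False,
    has_sink=False,
)
print(backend_cls)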
@@ -1,11 +1,10 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes
 
-from vllm_ascend.utils import vllm_version_is
-
 logger = init_logger(__name__)
 
 
@@ -39,6 +38,4 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
     set_weight_attrs(weight, extra_weight_attrs)
 
 
-if not vllm_version_is("0.10.2"):
-    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-    UnquantizedLinearMethod.create_weights = create_weights
+UnquantizedLinearMethod.create_weights = create_weights
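The version guard around this patch is gone, so the create_weights override is now installed unconditionally at import time. A toy sketch of the monkey-patching pattern itself (stand-in classes, not the vLLM ones):

class Greeter:
    # Stand-in for UnquantizedLinearMethod.
    def greet(self) -> str:
        return "hello"


def patched_greet(self) -> str:
    # Stand-in for the create_weights replacement defined above.
    return "hello from the patch"


Greeter.greet = patched_greet  # applied at import time, no version gate
assert Greeter().greet() == "hello from the patch"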
@@ -32,7 +32,7 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -131,10 +131,7 @@ class NPUPlatform(Platform):
         cache_config = vllm_config.cache_config
         scheduler_config = vllm_config.scheduler_config
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
-        if vllm_version_is("0.10.2"):
-            structured_outputs_config = vllm_config.decoding_config
-        else:
-            structured_outputs_config = vllm_config.structured_outputs_config
+        structured_outputs_config = vllm_config.structured_outputs_config
 
         if (model_config is not None and not model_config.use_mla
                 and not scheduler_config.async_scheduling):
@@ -212,9 +209,8 @@ class NPUPlatform(Platform):
         vllm_config._set_cudagraph_sizes()
 
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
-        if not vllm_version_is("0.10.2"):
-            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
-                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+        if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
@@ -3,13 +3,9 @@ import torch_npu
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
 from vllm.v1.sample.sampler import Sampler
 
-from vllm_ascend.utils import is_310p, vllm_version_is
+from vllm_ascend.utils import is_310p
 
-if vllm_version_is("0.10.2"):
-    from vllm.config import LogprobsMode
-    DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
-else:
-    DEFAULT_LOGPROBS_MODE = "raw_logprobs"
+DEFAULT_LOGPROBS_MODE = "raw_logprobs"
 
 
 class AscendSampler(Sampler):
@@ -69,18 +65,10 @@ class AscendTopKTopPSampler(TopKTopPSampler):
         """Override pytorch native implementation to torch_npu"""
         logits = self._apply_top_k_top_p(logits, k, p)
         logits_to_return = None
-        if vllm_version_is("0.10.2"):
-            if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
-                logits_to_return = logits
-            elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
-        else:
-            if self.logprobs_mode == "processed_logits":
-                logits_to_return = logits
-            elif self.logprobs_mode == "processed_logprobs":
-                logits_to_return = logits.log_softmax(dim=-1,
-                                                      dtype=torch.float32)
+        if self.logprobs_mode == "processed_logits":
+            logits_to_return = logits
+        elif self.logprobs_mode == "processed_logprobs":
+            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
         probs = logits.softmax(dim=-1, dtype=torch.float32)
         return random_sample(probs, generators), logits_to_return
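With the 0.10.2 branch removed, logprobs modes are compared as plain strings rather than the LogprobsMode enum that the old branch imported from vllm.config. A small self-contained sketch of the string-based handling kept above (not part of the diff; the tensor is a stand-in for real logits):

import torch

DEFAULT_LOGPROBS_MODE = "raw_logprobs"


def pick_logits_to_return(logits: torch.Tensor,
                          logprobs_mode: str = DEFAULT_LOGPROBS_MODE):
    # Mirrors the comparisons retained in the hunk above.
    if logprobs_mode == "processed_logits":
        return logits
    if logprobs_mode == "processed_logprobs":
        return logits.log_softmax(dim=-1, dtype=torch.float32)
    return None


print(pick_logits_to_return(torch.randn(2, 8), "processed_logprobs").shape)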
@@ -21,7 +21,6 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
-from vllm_ascend.utils import vllm_version_is
 
 PADDING_SLOT_ID = -1
 
@@ -352,10 +351,7 @@ class EagleProposer(Proposer):
             decode_token_per_req=self.runner.decode_token_per_req,
             num_computed_tokens_cpu=None,
             seq_lens=None)
-        if vllm_version_is("0.10.2"):
-            builder = self.runner.attn_groups[0][0].metadata_builder
-        else:
-            builder = self.runner.attn_groups[0][0].get_metadata_builder()
+        builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata_i = builder.build(0, common_attn_metadata,
                                         self.runner.get_model())
         for layer_name in kv_cache_group_spec.layer_names:
@@ -447,10 +443,7 @@ class EagleProposer(Proposer):
             num_computed_tokens_cpu=None,
             seq_lens=None)
         # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        if vllm_version_is("0.10.2"):
-            builder = self.runner.attn_groups[0][0].metadata_builder
-        else:
-            builder = self.runner.attn_groups[0][0].get_metadata_builder()
+        builder = self.runner.attn_groups[0][0].get_metadata_builder()
         attn_metadata = builder.build(0, common_attn_metadata,
                                       self.runner.get_model())
         if self.use_cuda_graph and \
@@ -479,10 +472,7 @@ class EagleProposer(Proposer):
             hidden_states=self.hidden_states[:num_input_tokens],
         )
         sample_hidden_states = last_hidden_states[last_token_indices]
-        if vllm_version_is("0.10.2"):
-            logits = self.model.compute_logits(sample_hidden_states, None)
-        else:
-            logits = self.model.compute_logits(sample_hidden_states)
+        logits = self.model.compute_logits(sample_hidden_states)
         draft_token_ids = logits.argmax(dim=-1)
 
         # Early exit if there is only one draft token to be generated.
@@ -586,12 +576,7 @@ class EagleProposer(Proposer):
                 hidden_states=self.hidden_states[:input_batch_size],
             )
             hidden_states = hidden_states[:batch_size]
-            if vllm_version_is("0.10.2"):
-                logits = self.model.compute_logits(
-                    last_hidden_states[:batch_size], None)
-            else:
-                logits = self.model.compute_logits(
-                    last_hidden_states[:batch_size])
+            logits = self.model.compute_logits(last_hidden_states[:batch_size])
 
             # TODO(wenlong): get more than one token for tree attention
             draft_token_ids = logits.argmax(dim=-1)
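The drafting paths now call compute_logits with hidden states only; the trailing sampling-metadata argument that 0.10.2 expected (passed as None in the removed branches) is gone. A self-contained sketch of the call shape, using a toy module rather than the vLLM model interface:

import torch


class TinyLM(torch.nn.Module):
    # Toy stand-in that only illustrates the one-argument compute_logits call.
    def __init__(self, hidden: int = 16, vocab: int = 32):
        super().__init__()
        self.lm_head = torch.nn.Linear(hidden, vocab, bias=False)

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.lm_head(hidden_states)


model = TinyLM()
sample_hidden_states = torch.randn(4, 16)
logits = model.compute_logits(sample_hidden_states)  # no second argument
draft_token_ids = logits.argmax(dim=-1)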
@@ -24,8 +24,7 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
                                         TorchairCommonAttentionMetadata)
-from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable,
-                               vllm_version_is)
+from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
 
 PADDING_SLOT_ID = -1
 
@@ -400,10 +399,7 @@ class MtpProposer(Proposer):
             seq_lens=None)
 
         if not self.torchair_graph_enabled:
-            if vllm_version_is("0.10.2"):
-                builder = self.runner.attn_groups[0][0].metadata_builder
-            else:
-                builder = self.runner.attn_groups[0][0].get_metadata_builder()
+            builder = self.runner.attn_groups[0][0].get_metadata_builder()
             attn_metadata_mtp = builder.build(0, common_attn_metadata,
                                               self.runner.get_model())
 
@@ -56,7 +56,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
                                                         init_metadata_for_sp)
-from vllm_ascend.utils import vllm_version_is
 
 
 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -312,14 +311,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.mlp")
             else:
-                if vllm_version_is("0.10.2"):
-                    self.mlp = Qwen3MoeSparseMoeBlock(
-                        config=config,
-                        quant_config=quant_config,
-                        prefix=f"{prefix}.mlp")
-                else:
-                    self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
-                                                      prefix=f"{prefix}.mlp")
+                self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
+                                                  prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
@@ -50,8 +50,7 @@ from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                                get_all_reduce_merge_state,
                                get_ascend_soc_version,
-                               get_rm_router_logits_state, is_310p,
-                               vllm_version_is)
+                               get_rm_router_logits_state, is_310p)
 
 
 def torchair_fused_experts_with_mc2(
@@ -1061,26 +1060,14 @@ class TorchairAscendFusedMoE(FusedMoE):
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.10.2"):
-            self.moe = FusedMoEConfig.make(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-                quant_config=quant_config)
-        else:
-            self.moe = FusedMoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                in_dtype=params_dtype,
-            )
-
+        self.moe = FusedMoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+        )
         if quant_config is None:
             self.quant_method = TorchairAscendUnquantizedFusedMoEMethod(
                 self.moe)
@@ -1242,12 +1229,8 @@ class TorchairAscendFusedMoE(FusedMoE):
             router_logits = get_dp_group().all_gather(router_logits, 0)
 
         elif fused_moe_state == FusedMoEState.NaiveMulticast:
-            if vllm_version_is("0.10.2"):
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_dp_cpu
-            else:
-                cu_tokens_across_dp_cpu = get_forward_context(
-                ).dp_metadata.cu_tokens_across_sp(1)
+            cu_tokens_across_dp_cpu = get_forward_context(
+            ).dp_metadata.cu_tokens_across_sp(1)
             hidden_states = self.naive_multicast(hidden_states,
                                                  cu_tokens_across_dp_cpu)
             if self.rm_router_logits:
@@ -78,10 +78,12 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 # yapf: disable
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                         KVCacheConfig, KVCacheGroupSpec,
-                                        KVCacheSpec, MambaSpec)
+                                        KVCacheSpec, MambaSpec,
+                                        UniformTypeKVCacheSpecs)
 # yapf: enable
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
+                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput,
+                             PoolerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -121,7 +123,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                get_ascend_soc_version, is_310p,
-                               lmhead_tp_enable, vllm_version_is)
+                               lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
@@ -143,13 +145,6 @@ if is_310p():
 else:
     ACL_FORMAT = ACL_FORMAT_FRACTAL_ND
 
-if not vllm_version_is("0.10.2"):
-    from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
-    from vllm.v1.outputs import PoolerOutput
-else:
-    from vllm.sequence import PoolerOutput
-    UniformTypeKVCacheSpecs = None
-
 
 @dataclass
 class GraphCaptureContext:
@@ -308,23 +303,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                              dtype=self.dtype,
                                              device=self.device)
         # Set up Attention
-        if vllm_version_is("0.10.2"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                self.model_config.is_attention_free,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sfa=self.ascend_config.use_sfa)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sfa=self.ascend_config.use_sfa)
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
@@ -602,12 +587,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             to_update.apply(pooling_params)
 
             backward_kwargs = {}
-            if vllm_version_is("0.10.2"):
-                backward_kwargs["mm_kwargs"] = new_req_data.mm_kwargs
-                backward_kwargs["mm_hashes"] = new_req_data.mm_hashes
-                backward_kwargs["mm_positions"] = new_req_data.mm_positions
-            else:
-                backward_kwargs["mm_features"] = new_req_data.mm_features
+            backward_kwargs["mm_features"] = new_req_data.mm_features
 
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
@@ -624,10 +604,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             if self.uses_mrope:
-                if vllm_version_is("0.10.2"):
-                    self._init_mrope_positions_0102(self.requests[req_id])
-                else:
-                    self._init_mrope_positions(self.requests[req_id])
+                self._init_mrope_positions(self.requests[req_id])
 
             req_ids_to_add.append(req_id)
 
@@ -759,39 +736,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 use_audio_in_video=use_audio_in_video,
             )
 
-    def _init_mrope_positions_0102(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        assert req_state.mm_kwargs is not None
-        for mm_item in req_state.mm_kwargs:
-            mm_input = mm_item.get_data()
-            if mm_input.get("image_grid_thw") is not None:
-                image_grid_thw.append(mm_input["image_grid_thw"].tolist())
-            if mm_input.get("video_grid_thw") is not None:
-                video_grid_thw.append(mm_input["video_grid_thw"].tolist())
-            if mm_input.get("second_per_grid_ts") is not None:
-                second_per_grid_ts.append(mm_input["second_per_grid_ts"])
-            if mm_input.get("audio_feature_lengths") is not None:
-                audio_feature_lengths.append(mm_input["audio_feature_lengths"])
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        hf_config = self.model_config.hf_config
-
-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-
     def _sync_metadata_across_dp(
             self, num_tokens: int, with_prefill: bool, enable_dbo: bool
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
@@ -966,12 +910,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             return
 
         # Batch the multi-modal inputs.
-        if vllm_version_is("0.10.2"):
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler_0102(
-                scheduler_output)
-        else:
-            mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
-                scheduler_output)
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)
         encoder_outputs = []
 
         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
@@ -1003,31 +943,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     is_embed=pos_info.is_embed,
                 )
 
-    # TODO: remove this once we drop support for vLLM 0.10.2
-    def _batch_mm_kwargs_from_scheduler_0102(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return [], []
-        # Batch the multi-modal inputs.
-        mm_kwargs = list[MultiModalKwargsItem]()
-        # list of tuple (mm_hash, position_info)
-        mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-            assert req_state.mm_hashes is not None
-            assert req_state.mm_kwargs is not None
-            assert req_state.mm_positions is not None
-            for mm_input_id in encoder_input_ids:
-                mm_hash = req_state.mm_hashes[mm_input_id]
-                mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
-                mm_hashes_pos.append(
-                    (mm_hash, req_state.mm_positions[mm_input_id]))
-
-        return mm_kwargs, mm_hashes_pos
-
     def _batch_mm_kwargs_from_scheduler(
         self,
         scheduler_output: "SchedulerOutput",
@@ -1067,20 +982,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     ) -> list[torch.Tensor]:
 
         def _iter_mm_features(req_state: CachedRequestState):
-            if vllm_version_is("0.10.2"):
-                # legacy path (to be removed later)
-                assert req_state.mm_hashes is not None
-                assert req_state.mm_positions is not None
-                for mm_hash, pos_info in zip(req_state.mm_hashes,
-                                             req_state.mm_positions):
-                    yield mm_hash, pos_info, getattr(pos_info, "is_embed",
-                                                     None)
-            else:
-                assert req_state.mm_features is not None
-                for mm_feature in req_state.mm_features:
-                    pos_info = mm_feature.mm_position
-                    yield mm_feature.identifier, pos_info, getattr(
-                        pos_info, "is_embed", None)
+            assert req_state.mm_features is not None
+            for mm_feature in req_state.mm_features:
+                pos_info = mm_feature.mm_position
+                yield mm_feature.identifier, pos_info, getattr(
+                    pos_info, "is_embed", None)
 
         mm_embeds: list[torch.Tensor] = []
 
@@ -1527,10 +1433,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for attn_group in self.attn_groups[kv_cache_group_id]:
             common_prefix_len = 0
             extra_attn_metadata_args = {}
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if isinstance(builder, GDNAttentionMetadataBuilder):
                 if use_spec_decode:
                     extra_attn_metadata_args = dict(
@@ -1809,29 +1712,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                    device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        if vllm_version_is("0.10.2"):
-            # Pooling models D2H & synchronize occurs in pooler.py:build_output
-            raw_pooler_output = self.model.pooler(
-                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            raw_pooler_output = model.pooler(
-                hidden_states=hidden_states,
-                pooling_metadata=pooling_metadata,
-            )
-            raw_pooler_output = json_map_leaves(
-                lambda x: x.to("cpu", non_blocking=True),
-                raw_pooler_output,
-            )
-            torch.npu.synchronize()
+        model = cast(VllmModelForPooling, self.model)
+        raw_pooler_output = model.pooler(
+            hidden_states=hidden_states,
+            pooling_metadata=pooling_metadata,
+        )
+        raw_pooler_output = json_map_leaves(
+            lambda x: x.to("cpu", non_blocking=True),
+            raw_pooler_output,
+        )
+        torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-            if vllm_version_is("0.10.2"):
-                output = raw_output.data if seq_len == prompt_len else None
-            else:
-                output = raw_output if seq_len == prompt_len else None
+            output = raw_output if seq_len == prompt_len else None
             pooler_output.append(output)
 
         return ModelRunnerOutput(
@@ -2006,8 +1901,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 num_scheduled_tokens_np, finished_sending,
                 finished_recving, kv_connector_output)
             sample_hidden_states = hidden_states[logits_indices]
-            logits = self._compute_logits_wrapper(sample_hidden_states,
-                                                  None)
+            logits = self.model.compute_logits(sample_hidden_states)
             if broadcast_pp_output:
                 model_output_broadcast_data = {
                     "logits": logits.contiguous(),
@@ -2302,10 +2196,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             )
 
             for attn_group in self.attn_groups[kv_cache_group_id]:
-                if vllm_version_is("0.10.2"):
-                    builder = attn_group.metadata_builder
-                else:
-                    builder = attn_group.get_metadata_builder()
+                builder = attn_group.get_metadata_builder()
                 attn_metadata_i = builder.build_for_graph_capture(
                     common_attn_metadata)
                 for layer_name in kv_cache_group_spec.layer_names:
@@ -2463,8 +2354,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                      dtype=torch.int32)
 
         def dummy_compute_logits(hidden_states):
-            return self._compute_logits_wrapper(
-                hidden_states[dummy_indices], None)
+            return self.model.compute_logits(
+                hidden_states[dummy_indices])
 
         with set_ascend_forward_context(
                 attn_metadata,
@@ -2542,18 +2433,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             logit_indices = np.cumsum(num_scheduled_tokens) - 1
             # TODO: need to rum a dummy sampler for generate task
             hidden_states = hidden_states[logit_indices]
-            output = self._compute_logits_wrapper(hidden_states, None)
+            output = self.model.compute_logits(hidden_states)

         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()

-    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
-        if vllm_version_is("0.10.2"):
-            return self.model.compute_logits(hidden_states, sampling_metadata)
-        return self.model.compute_logits(hidden_states)
-
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
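For reference, the logit-index arithmetic in the profiling hunk above picks exactly one hidden state per request before `compute_logits` runs. A small self-contained sketch with toy shapes and no model involved; the values of `num_scheduled_tokens` are made up:

    import numpy as np
    import torch

    num_scheduled_tokens = np.array([4, 1, 3])           # tokens scheduled per request
    logit_indices = np.cumsum(num_scheduled_tokens) - 1  # last token of each request -> [3, 4, 7]

    hidden_states = torch.randn(int(num_scheduled_tokens.sum()), 16)
    last_token_hidden = hidden_states[torch.from_numpy(logit_indices)]  # shape [3, 16]
    # The runner then hands these rows to self.model.compute_logits(last_token_hidden).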
@@ -2615,10 +2501,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            if vllm_version_is("0.10.2"):
-                output_size[task] = output.get_data_nbytes()
-            else:
-                output_size[task] = sum(o.nbytes for o in output)
+            output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC

         max_task = max(output_size.items(), key=lambda x: x[1])[0]
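The unified branch sizes a dummy pooler output by summing `nbytes` over the returned tensors. A toy example of that accounting; the shapes and dtype are invented:

    import torch

    dummy_output = [torch.zeros(8, 1024, dtype=torch.float16) for _ in range(4)]
    output_size = sum(o.nbytes for o in dummy_output)
    print(output_size)  # 4 tensors * 8 * 1024 elements * 2 bytes = 65536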
@@ -2657,16 +2540,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     self.model.get_eagle3_aux_hidden_state_layers())

             if self.lora_config:
-                if vllm_version_is("0.10.2"):
-                    self.model = self.load_lora_model(self.model,
-                                                      self.model_config,
-                                                      self.scheduler_config,
-                                                      self.lora_config,
-                                                      self.device)
-                else:
-                    self.model = self.load_lora_model(self.model,
-                                                      self.vllm_config,
-                                                      self.device)
+                self.model = self.load_lora_model(self.model, self.vllm_config,
+                                                  self.device)
             logger.info("Loading model weights took %.4f GB",
                         m.consumed_memory / float(2**30))

@@ -2694,17 +2569,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.initialize_attn_backend(kv_cache_config)
         self.use_hybrid_blocks = (len(self.attn_groups) > 1)
         # NOTE: Currently, we determine whether we need `num_accepted_tokens` through `MambaSpec`.
-        if vllm_version_is("0.10.2"):
-            self.need_accepted_tokens = any([
-                isinstance(
-                    self.kv_cache_config.kv_cache_groups[0].kv_cache_spec,
-                    MambaSpec) for attn_group in self.attn_groups
-            ])
-        else:
-            self.need_accepted_tokens = any([
-                isinstance(attn_group[0].kv_cache_spec, MambaSpec)
-                for attn_group in self.attn_groups
-            ])
+        self.need_accepted_tokens = any([
+            isinstance(attn_group[0].kv_cache_spec, MambaSpec)
+            for attn_group in self.attn_groups
+        ])

         self.may_reinitialize_input_batch(kv_cache_config)

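The check above only inspects the first member of each attention group and asks whether its kv-cache spec is the Mamba variant. A stand-alone illustration of that data shape; the `*Stub` dataclasses are placeholders for vLLM's real spec and group types:

    from dataclasses import dataclass

    @dataclass
    class MambaSpecStub:          # stand-in for vLLM's MambaSpec
        block_size: int

    @dataclass
    class FullAttentionSpecStub:  # stand-in for a regular attention spec
        block_size: int

    @dataclass
    class AttnGroupStub:          # stand-in for an AttentionGroup entry
        kv_cache_spec: object

    # self.attn_groups is a list of lists; the first group member carries the spec.
    attn_groups = [[AttnGroupStub(MambaSpecStub(16))],
                   [AttnGroupStub(FullAttentionSpecStub(128))]]
    need_accepted_tokens = any(
        isinstance(g[0].kv_cache_spec, MambaSpecStub) for g in attn_groups)
    assert need_accepted_tokens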
@@ -2737,11 +2605,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2846,11 +2711,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -2996,11 +2858,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             )), "Some layers are not correctly initialized"

         kv_caches: Dict[str, torch.Tensor] = {}
-        for group in self._kv_cache_spec_attn_group_iterator_dispatcher():
-            if vllm_version_is("0.10.2"):
-                kv_cache_spec, group = group
-            else:
-                kv_cache_spec = group.kv_cache_spec
+        for group in self._kv_cache_spec_attn_group_iterator():
+            kv_cache_spec = group.kv_cache_spec
             attn_backend = group.backend
             for layer_name in group.layer_names:
                 if layer_name in self.runner_only_attn_layers:
@@ -3211,50 +3070,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 for k, v in attn_backend_layers.items()
             }

-        def get_attn_backends_for_layers(
-                layer_names: list[str]
-        ) -> dict[type[AttentionBackend], list[str]]:
-            """Get attention_backend for all attention layers
-            TODO: Only used in v0.10.2, drop me when 0.10.2 is dropped
-            """
-            layers = get_layers_from_vllm_config(self.vllm_config,
-                                                 AttentionLayerBase,
-                                                 layer_names)
-            attn_backends = {}
-            attn_backend_layers = defaultdict(list)
-            # Dedupe based on full class name; this is a bit safer than
-            # using the class itself as the key because when we create dynamic
-            # attention backend subclasses (e.g. ChunkedLocalAttention) unless
-            # they are cached correctly, there will be different objects per
-            # layer.
-            for layer_name in layer_names:
-                attn_backend = layers[layer_name].get_attn_backend()
-                key = attn_backend.full_cls_name()
-                attn_backends[key] = attn_backend
-                attn_backend_layers[key].append(layer_name)
-            return {
-                attn_backends[k]: v
-                for k, v in attn_backend_layers.items()
-            }
-
-        def create_attn_groups_v0102(
-            attn_backends_map: dict[AttentionBackend, list[str]],
-            kv_cache_spec: KVCacheSpec,
-        ) -> list[AttentionGroup]:
-            attn_groups: list[AttentionGroup] = []
-            for attn_backend, layer_names in attn_backends_map.items():
-                attn_metadata_builder_i = attn_backend.get_builder_cls()(
-                    kv_cache_spec,
-                    layer_names,
-                    self.vllm_config,
-                    self.device,
-                )
-                attn_group = AttentionGroup(attn_backend,
-                                            attn_metadata_builder_i,
-                                            layer_names)
-                attn_groups.append(attn_group)
-            return attn_groups
-
         def create_attn_groups(
             attn_backends_map: dict[AttentionBackend, list[str]],
         ) -> list[AttentionGroup]:
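The deleted helper grouped layers by attention backend while deduplicating on the backend's fully qualified class name, since dynamically created subclasses can differ per layer even when they are logically the same backend. A generic sketch of that grouping idea, independent of vLLM's types; the function and parameter names are illustrative:

    from collections import defaultdict

    def group_names_by_class(objs_by_name: dict[str, object]) -> dict[type, list[str]]:
        # Dedupe on the qualified class name rather than the class object itself.
        classes: dict[str, type] = {}
        names_by_key: dict[str, list[str]] = defaultdict(list)
        for name, obj in objs_by_name.items():
            cls = type(obj)
            key = f"{cls.__module__}.{cls.__qualname__}"
            classes[key] = cls
            names_by_key[key].append(name)
        return {classes[k]: v for k, v in names_by_key.items()}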
@@ -3274,18 +3089,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 attn_groups.append(attn_group)
             return attn_groups

-        if vllm_version_is("0.10.2"):
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                kv_cache_spec = kv_cache_group_spec.kv_cache_spec
-                attn_backends = get_attn_backends_for_layers(
-                    kv_cache_group_spec.layer_names)
-                self.attn_groups.append(
-                    create_attn_groups_v0102(attn_backends, kv_cache_spec))
-        else:
-            for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
-                attn_backends = get_attn_backends_for_group(  # type: ignore
-                    kv_cache_group_spec)
-                self.attn_groups.append(create_attn_groups(attn_backends))
+        for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
+            attn_backends = get_attn_backends_for_group(  # type: ignore
+                kv_cache_group_spec)
+            self.attn_groups.append(create_attn_groups(attn_backends))

         # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()
@@ -3299,31 +3106,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for attn_groups in self.attn_groups:
             yield from attn_groups

-    def _kv_cache_spec_attn_group_iterator_v0102(
-            self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
-        if not self.kv_cache_config.kv_cache_groups:
-            return
-        for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
-            for attn_group in attn_groups:
-                yield self.kv_cache_config.kv_cache_groups[
-                    kv_cache_spec_id].kv_cache_spec, attn_group
-
-    def _kv_cache_spec_attn_group_iterator_dispatcher(self):
-        if vllm_version_is("0.10.2"):
-            return self._kv_cache_spec_attn_group_iterator_v0102()
-        else:
-            return self._kv_cache_spec_attn_group_iterator()
-
     def calculate_reorder_batch_threshold(self) -> None:
         """
         Check that if any backends reorder batches; that the reordering
         is compatible (e.g., decode threshold is the same)
         """
         for group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                attn_metadata_builder_i = group.metadata_builder
-            else:
-                attn_metadata_builder_i = group.get_metadata_builder()
+            attn_metadata_builder_i = group.get_metadata_builder()
             if hasattr(attn_metadata_builder_i, "reorder_batch_threshold"):
                 # check that if any backends reorder batches; that the reordering
                 # is compatible (e.g., decode threshold is the same)
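With the version dispatcher removed, iterating over attention groups reduces to the plain nested `yield from` pattern kept in `_attn_group_iterator`. A tiny stand-alone illustration; the list contents are placeholders:

    from typing import Iterator

    def iter_groups(nested: list[list[str]]) -> Iterator[str]:
        # One generator walks the outer list of groups and yields every member in order.
        for group in nested:
            yield from group

    assert list(iter_groups([["a", "b"], ["c"]])) == ["a", "b", "c"]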
@@ -3427,10 +3216,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         min_ag_builder_name = None

         for attn_group in self._attn_group_iterator():
-            if vllm_version_is("0.10.2"):
-                builder = attn_group.metadata_builder
-            else:
-                builder = attn_group.get_metadata_builder()
+            builder = attn_group.get_metadata_builder()
             if builder.aclgraph_support.value < min_ag_support.value:
                 min_ag_support = builder.aclgraph_support
                 min_ag_builder_name = builder.__class__.__name__
@@ -3674,7 +3460,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc_np[req_idx].item()
             prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self._compute_logits_wrapper(prompt_hidden_states, None)
+            logits = self.model.compute_logits(prompt_hidden_states)

             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want
@@ -39,7 +39,6 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice

-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.block_table import MultiGroupBlockTable


@@ -79,12 +78,6 @@ class CachedRequestState:
     @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
                 "removed in v0.13. Please use `mm_kwargs` instead.")
     def mm_inputs(self) -> list[MultiModalKwargsItems]:
-        if vllm_version_is("0.10.2"):
-            assert self.mm_kwargs is not None
-            return [
-                MultiModalKwargsItems.from_seq([item])
-                for item in self.mm_kwargs
-            ]
         assert self.mm_features is not None
         return [
             MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features