Mirror of https://github.com/vllm-project/vllm-ascend.git (synced 2025-10-20 13:43:53 +08:00)
[CI] upgrade to vllm 0.9.0 (#959)
Upgrade to vllm 0.9.0; vllm 0.8.5 is no longer supported. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
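Most of the Python changes below delete the vllm_version_is("0.8.5") / vllm_version_is("0.8.5.post1") branches so that only the 0.9.0 and main code paths remain. As a rough sketch of what such a version gate looks like (an assumption for illustration; the real helper lives in vllm_ascend/utils.py and may be implemented differently):

# Hypothetical sketch of a vllm_version_is-style gate; not the actual
# vllm_ascend.utils implementation.
import vllm


def vllm_version_is(target: str) -> bool:
    # True only when the installed vllm release string matches exactly.
    return vllm.__version__ == target


# After this commit the 0.8.5 branches are gone; 0.9.0 is gated instead.
if vllm_version_is("0.9.0"):
    print("applying vllm v0.9.0 specific patches")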
.github/workflows/vllm_ascend_test.yaml (vendored)

@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     concurrency:
       group: >
         ${{

@@ -41,7 +41,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     name: vLLM Ascend long term test
     runs-on: linux-arm64-npu-1
     container:
.github/workflows/vllm_ascend_test_pd.yaml (vendored)

@@ -40,7 +40,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
@@ -26,9 +26,9 @@ import torch
 from vllm import SamplingParams
 from vllm.sequence import PromptLogprobs, SampleLogprobs

-from ....model_utils import (TokensTextLogprobs,
-                             TokensTextLogprobsPromptLogprobs,
-                             check_logprobs_close, check_outputs_equal)
+from tests.model_utils import (TokensTextLogprobs,
+                               TokensTextLogprobsPromptLogprobs,
+                               check_logprobs_close, check_outputs_equal)

 PROMPTS = [
     "Hello, my name is",
@@ -30,7 +30,6 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is


 class AscendAttentionBackend(AttentionBackend):

@@ -142,14 +141,11 @@ class AscendAttentionMetadataBuilder:

     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])

         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
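The deleted branch existed because the old call site read a single block table object, while the 0.9.0-style InputBatch exposes block_table as an indexable collection of per-KV-cache-group tables, hence the [0] index before get_device_tensor(). A self-contained toy sketch of the shape of that change (the classes below are illustrative stand-ins, not the real vllm types):

# Illustrative stand-ins only; the real InputBatch/BlockTable classes live in
# vllm.v1.worker and differ in detail.
import torch


class ToyBlockTable:

    def __init__(self, max_num_reqs: int, max_num_blocks_per_req: int):
        self._table = torch.zeros(max_num_reqs,
                                  max_num_blocks_per_req,
                                  dtype=torch.int32)

    def get_device_tensor(self) -> torch.Tensor:
        return self._table


class ToyInputBatch085:
    """0.8.5-style batch: a single block table object."""

    def __init__(self):
        self.block_table = ToyBlockTable(8, 16)


class ToyInputBatch090:
    """0.9.0-style batch: one block table per KV-cache group."""

    def __init__(self, num_kv_cache_groups: int = 1):
        self.block_table = [
            ToyBlockTable(8, 16) for _ in range(num_kv_cache_groups)
        ]


# Old call shape:  input_batch.block_table.get_device_tensor()
# New call shape:  input_batch.block_table[0].get_device_tensor()
old = ToyInputBatch085().block_table.get_device_tensor()
new = ToyInputBatch090().block_table[0].get_device_tensor()
assert old.shape == new.shape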
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

 if TYPE_CHECKING:

@@ -239,14 +238,11 @@ class AscendMLAMetadataBuilder:
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
@@ -26,18 +26,10 @@ from vllm.distributed import (GroupCoordinator,
                               tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
-
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()

         ep_group = get_ep_group()
@@ -731,24 +721,17 @@ class AscendFusedMoE(FusedMoE):
         params_dtype = torch.get_default_dtype()

         vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))
-
-            self.moe_parallel_config.ep_size = get_ep_group().world_size
-            self.moe_parallel_config.tp_size = get_etp_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config.tp_size = get_etp_group().world_size

         self.top_k = top_k
         self.num_experts = num_experts
@@ -773,54 +756,39 @@ class AscendFusedMoE(FusedMoE):
             self.local_num_experts, self.expert_map = determine_expert_map(
                 self.ep_size,
                 get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
-
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group

         else:
             # Adjust TP size for DP attention
             # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
-
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1

             self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )
-
-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)

         assert self.quant_method is not None
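The constructor above still derives self.local_num_experts and self.expert_map from determine_expert_map(self.ep_size, get_ep_group().rank_in_group, self.global_num_experts) when expert parallelism is enabled. As a rough illustration of what such an expert-to-rank mapping can look like (a simplified sketch, not vllm's actual determine_expert_map, which may handle uneven splits and edge cases differently):

from typing import Optional, Tuple

import torch


def toy_determine_expert_map(
        ep_size: int, ep_rank: int,
        global_num_experts: int) -> Tuple[int, Optional[torch.Tensor]]:
    # Illustrative even split of experts across expert-parallel ranks.
    if ep_size == 1:
        return global_num_experts, None
    local = global_num_experts // ep_size
    start = ep_rank * local
    # Map global expert id -> local id on this rank; -1 marks experts that
    # live on other ranks.
    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
    expert_map[start:start + local] = torch.arange(local, dtype=torch.int32)
    return local, expert_map


local_num_experts, expert_map = toy_determine_expert_map(
    ep_size=4, ep_rank=1, global_num_experts=64)
assert local_num_experts == 16 and int(expert_map[16]) == 0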
@@ -24,16 +24,9 @@
 # each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
+# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.8.5 and main branch.
-#
-# In the future, with the vllm version upgrade, the new patch folder such as
-# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
-# vllm version. And the patch_common will contain the patches applied in all the
-# vllm version.
-# Once the vllm version is too old that vllm-ascend will not support, the related
-# patch folder will be removed as well.
+# - patch_common: contains the patches applied in both 0.9.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
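The two __init__.py hunks above follow the layout described in the patch package docstring: version-specific folders are gated on vllm_version_is, while patch_common is always imported. A hedged sketch of how a later release could extend the same gate (the patch_0_9_1 folder and the 0.9.1 version are hypothetical, not part of this commit):

# Hypothetical extension of the gated patch imports; patch_0_9_1 does not
# exist in this commit and is shown only to illustrate the pattern.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.9.0"):
    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
elif vllm_version_is("0.9.1"):
    from vllm_ascend.patch.worker import patch_0_9_1  # noqa: F401

# patch_common applies to every supported vllm version.
from vllm_ascend.patch.worker import patch_common  # noqa: F401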
@@ -64,8 +64,6 @@ from vllm.worker.model_runner_base import (
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)

-from vllm_ascend.utils import vllm_version_is
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend

@@ -1017,10 +1015,8 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
         pattern: Optional[str] = None,
         max_size: Optional[int] = None,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import ShardedStateLoader  # type: ignore[import]  # isort: skip  # noqa
-        else:
-            from vllm.model_executor.model_loader import ShardedStateLoader
-
+        from vllm.model_executor.model_loader import ShardedStateLoader
         ShardedStateLoader.save_model(
             self.model,
             path,

@@ -1032,12 +1028,9 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
         self,
         tensorizer_config: TensorizerConfig,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import \
-                TensorizerLoader  # type: ignore  # noqa
-        else:
-            from vllm.model_executor.model_loader import \
-                TensorizerLoader  # type: ignore  # noqa
-
+        from vllm.model_executor.model_loader import \
+            TensorizerLoader  # type: ignore  # noqa
         TensorizerLoader.save_model(
             self.model,
             tensorizer_config=tensorizer_config,
@@ -61,7 +61,6 @@ from vllm_ascend.attention.attention import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import vllm_version_is

 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]

@@ -210,16 +209,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Request states.
         self.requests: Dict[str, CachedRequestState] = {}
         # Persistent batch.
-        # Remove this after we drop 0.8.5 support
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_blocks_per_req=self.max_num_blocks_per_req,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-            )

         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,

@@ -573,10 +562,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):

         block_table_indices = (req_indices * self.max_num_blocks_per_req +
                                positions_np // self.block_size)
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
-        else:
-            block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
-
+        block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
         block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
         block_offsets = positions_np % self.block_size
         np.add(block_numbers * self.block_size,

@@ -1182,16 +1169,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         """
         import torch_npu
         kv_caches: Dict[str, torch.Tensor] = {}
-        if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_size=self.cache_config.block_size,
-            )
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=True,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_size=self.cache_config.block_size,
+        )

         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec