From 787010a637ad110b2ef3f12fab9e57f6607938e5 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Tue, 15 Jul 2025 12:49:57 +0800
Subject: [PATCH] [Test] Remove VLLM_USE_V1 in example and tests (#1733)

V1 is enabled by default, so there is no need to set it by hand anymore.
This PR removes the now-unnecessary settings from the examples and tests.

- vLLM version: v0.9.2
- vLLM main: https://github.com/vllm-project/vllm/commit/9ad0a4588ba4e9c979cda0d178dec4fcdb89fd0c

Signed-off-by: wangxiyuan
---
 .github/workflows/vllm_ascend_test.yaml | 69 ++-------
 examples/offline_data_parallel.py | 1 -
 examples/offline_dualbatch_overlap_npu.py | 1 -
 examples/offline_inference_sleep_mode_npu.py | 1 -
 examples/run_dp_attention_etp16.sh | 1 -
 requirements-dev.txt | 1 +
 requirements-lint.txt | 1 +
 tests/{ => e2e}/conftest.py | 6 +-
 tests/{ => e2e}/model_utils.py | 0
 .../multicard/test_fused_moe_allgather_ep.py | 15 +-
 tests/e2e/multicard/test_ilama_lora_tp2.py | 2 +-
 .../test_offline_inference_distributed.py | 2 +-
 tests/e2e/multicard/test_pipeline_parallel.py | 2 +-
 tests/e2e/multicard/test_prefix_caching.py | 10 +-
 .../e2e/multicard/test_torchair_graph_mode.py | 10 +-
 .../test_ascend_scheduler_e2e.py | 4 -
 .../ascend_scheduler/test_chunk_prefill.py | 4 +-
 .../spec_decode_v1/test_v1_mtp_correctness.py | 54 ++++---
 .../spec_decode_v1/test_v1_spec_decode.py | 132 ++++++++----------
 tests/e2e/singlecard/test_aclgraph.py | 47 +++----
 tests/e2e/singlecard/test_camem.py | 2 +-
 tests/e2e/singlecard/test_chunked.py | 57 ++++----
 tests/e2e/singlecard/test_embedding.py | 4 +-
 tests/e2e/singlecard/test_guided_decoding.py | 10 +-
 tests/e2e/singlecard/test_ilama_lora.py | 2 +-
 .../e2e/singlecard/test_offline_inference.py | 2 +-
 tests/{ => e2e}/utils.py | 0
 tests/ut/test_ascend_config.py | 35 +++--
 tests/ut/test_platform.py | 2 -
 29 files changed, 186 insertions(+), 291 deletions(-)
 rename tests/{ => e2e}/conftest.py (98%)
 rename tests/{ => e2e}/model_utils.py (100%)
 rename tests/{ => e2e}/utils.py (100%)

diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index ec23fcef1..d06fd3d25 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -41,16 +41,10 @@ concurrency:
 
 jobs:
   lint:
-    # Only trigger lint on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     uses: ./.github/workflows/pre-commit.yml
 
   changes:
-    # Only trigger changes on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
     outputs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
@@ -60,20 +54,24 @@ jobs:
         with:
          filters: |
            e2e_tracker:
+              - '.github/workflows/vllm_ascend_test.yaml'
               - 'vllm_ascend/**'
               - 'csrc/**'
               - 'cmake/**'
               - 'tests/e2e/**'
-              - 'tests/conftest.py'
-              - 'tests/model_utils.py'
-              - 'tests/utils.py'
+              - 'CMakeLists.txt'
+              - 'setup.py'
+              - 'requirements.txt'
+              - 'requirements-dev.txt'
+              - 'requirements-lint.txt'
+              - 'packages.txt'
            ut_tracker:
              - 'tests/ut/**'
 
   ut:
     needs: [lint, changes]
     name: unit test
-    # only trigger unit test after lint passed and the change is e2e and ut related. Or the PR is merged.
-    if: ${{ github.event_name == 'push' || (needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true')) }}
+    # only trigger unit test after lint passed and the change is e2e and ut related.
+ if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} runs-on: ubuntu-latest container: image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 @@ -112,9 +110,8 @@ jobs: python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ - - name: Run unit test for V1 Engine + - name: Run unit test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn TORCH_DEVICE_BACKEND_AUTOLOAD: 0 run: | @@ -133,8 +130,8 @@ jobs: e2e: needs: [lint, changes] - # only trigger e2e test after lint passed and the change is e2e related. - if: ${{ needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }} + # only trigger e2e test after lint passed and the change is e2e related with pull request. + if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }} strategy: max-parallel: 2 matrix: @@ -189,9 +186,8 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run e2e test for V1 Engine + - name: Run e2e test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | @@ -213,26 +209,6 @@ jobs: # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - name: Run e2e test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/test_prompt_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py \ - --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ - --ignore=tests/e2e/singlecard/test_embedding.py - e2e-4-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} @@ -290,9 +266,8 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run vllm-project/vllm-ascend test for V1 Engine + - name: Run vllm-project/vllm-ascend test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | @@ -308,19 +283,3 @@ jobs: pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ --ignore=tests/e2e/multicard/test_data_parallel.py - - - name: Run vllm-project/vllm-ascend test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/test_data_parallel.py - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ - --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ - --ignore=tests/e2e/multicard/test_data_parallel.py diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py index 64084ac69..754dfbc7c 100644 --- a/examples/offline_data_parallel.py +++ b/examples/offline_data_parallel.py @@ -120,7 +120,6 @@ def main( trust_remote_code, ): # DP only support on V1 engine - os.environ["VLLM_USE_V1"] = "1" os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py index d8153e38c..e721ab2aa 100644 --- a/examples/offline_dualbatch_overlap_npu.py +++ b/examples/offline_dualbatch_overlap_npu.py @@ -5,7 +5,6 @@ from vllm import LLM, SamplingParams # enable dual-batch overlap for vllm ascend os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1" -os.environ["VLLM_USE_V1"] = "1" # Sample prompts. prompts = ["The president of the United States is"] * 41 diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py index 01b3b4a7e..7b7d42268 100644 --- a/examples/offline_inference_sleep_mode_npu.py +++ b/examples/offline_inference_sleep_mode_npu.py @@ -22,7 +22,6 @@ import torch from vllm import LLM, SamplingParams from vllm.utils import GiB_bytes -os.environ["VLLM_USE_V1"] = "1" os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh index 62233de7c..5d87879a1 100644 --- a/examples/run_dp_attention_etp16.sh +++ b/examples/run_dp_attention_etp16.sh @@ -1,4 +1,3 @@ -export VLLM_USE_V1=1 export TASK_QUEUE_ENABLE=1 source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/nnal/atb/set_env.sh diff --git a/requirements-dev.txt b/requirements-dev.txt index 8bd7dcadb..d4a5acd1a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,4 +12,5 @@ xgrammar zmq types-psutil pytest-cov +regex sentence_transformers diff --git a/requirements-lint.txt b/requirements-lint.txt index eab3838f0..8a575e5a2 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -4,5 +4,6 @@ pre-commit==4.0.1 # type checking mypy==1.11.1 types-PyYAML +types-regex types-requests types-setuptools diff --git a/tests/conftest.py b/tests/e2e/conftest.py similarity index 98% rename from tests/conftest.py rename to tests/e2e/conftest.py index 64bc466e7..50ca0f3e3 100644 --- a/tests/conftest.py +++ b/tests/e2e/conftest.py @@ -39,8 +39,8 @@ from vllm.sampling_params import BeamSearchParams from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils import is_list_of -from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs) +from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) # TODO: remove this part after the patch merged into vllm, if # we not explicitly patch here, some of them might be effectiveless # in pytest scenario @@ -62,7 +62,7 @@ PromptAudioInput = 
_PromptMultiModalInput[Tuple[np.ndarray, int]] PromptVideoInput = _PromptMultiModalInput[np.ndarray] _TEST_DIR = os.path.dirname(__file__) -_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")] +_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] def cleanup_dist_env_and_memory(shutdown_ray: bool = False): diff --git a/tests/model_utils.py b/tests/e2e/model_utils.py similarity index 100% rename from tests/model_utils.py rename to tests/e2e/model_utils.py diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py index ad755dd16..221d33f0d 100644 --- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py +++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py @@ -26,12 +26,11 @@ from unittest.mock import patch from modelscope import snapshot_download # type: ignore from vllm import SamplingParams -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner @patch.dict( os.environ, { - "VLLM_USE_V1": "1", "VLLM_WORKER_MULTIPROC_METHOD": "spawn", "TASK_QUEUE_ENABLE": "1", "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1" @@ -56,12 +55,10 @@ def test_generate_with_allgather(): vllm_model.generate(example_prompts, sampling_params) -@patch.dict( - os.environ, { - "VLLM_USE_V1": "1", - "VLLM_WORKER_MULTIPROC_METHOD": "spawn", - "TASK_QUEUE_ENABLE": "1" - }) +@patch.dict(os.environ, { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "TASK_QUEUE_ENABLE": "1" +}) def test_generate_with_alltoall(): example_prompts = ["Hello, my name is"] sampling_params = SamplingParams(max_tokens=100, temperature=0.0) @@ -79,4 +76,4 @@ def test_generate_with_alltoall(): }, "expert_tensor_parallel_size": 1 }) as vllm_model: - vllm_model.generate(example_prompts, sampling_params) \ No newline at end of file + vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py index 3f62bfd7e..e22550c2f 100644 --- a/tests/e2e/multicard/test_ilama_lora_tp2.py +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -1,7 +1,7 @@ import pytest from modelscope import snapshot_download # type: ignore -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 58d0bf0ba..2b155383c 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -27,7 +27,7 @@ from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from vllm.model_executor.models.registry import ModelRegistry -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py index a7070b688..c0c757ab3 100644 --- a/tests/e2e/multicard/test_pipeline_parallel.py +++ b/tests/e2e/multicard/test_pipeline_parallel.py @@ -16,7 +16,7 @@ # import pytest -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODELS = [ "Qwen/Qwen3-0.6B", diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py index 368d3ff95..73d0d2c4a 100644 --- a/tests/e2e/multicard/test_prefix_caching.py +++ 
b/tests/e2e/multicard/test_prefix_caching.py @@ -2,12 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching on V1 scheduler or AscendScheduler.""" -import os - import pytest -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ # for MHA @@ -60,8 +58,6 @@ INPUT_PROMPTS = [ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: @@ -89,8 +85,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_ascend_scheduler(model: str, diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index ce628f9d3..d363560dd 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -22,9 +22,7 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`. import os from typing import Dict -import pytest - -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" @@ -78,8 +76,6 @@ def _deepseek_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair(): additional_config = { "torchair_graph_config": { @@ -89,8 +85,6 @@ def test_e2e_deepseekv3_with_torchair(): _deepseek_torchair_test_fixture(additional_config) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair_ms_mla(): additional_config = { "torchair_graph_config": { @@ -150,8 +144,6 @@ def _pangu_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_pangu_with_torchair(): additional_config = { "torchair_graph_config": { diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py index 17116ab59..73e392a28 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py +++ b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc -import os import pytest import torch from vllm import LLM -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "Qwen/Qwen2.5-0.5B-Instruct" PROMPT = "Hello my name is Robert and I" diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py b/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py index 0b557960e..e25e857da 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py +++ 
b/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py @@ -9,8 +9,8 @@ Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`. """ import pytest -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ "Qwen/Qwen3-0.6B-Base", diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py index 0cf64b059..10322f49e 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py @@ -53,7 +53,6 @@ def model_name(): @pytest.mark.skipif( True, reason="TODO: Enable me after test_mtp_correctness is fixed") def test_mtp_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -62,33 +61,30 @@ def test_mtp_correctness( Compare the outputs of a original LLM and a speculative LLM should be the same when using mtp speculative decoding. ''' - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + spec_llm = LLM(model=model_name, + trust_remote_code=True, + speculative_config={ + "method": "deepseek_mtp", + "num_speculative_tokens": 1, + }, + max_model_len=256, + enforce_eager=True) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - spec_llm = LLM(model=model_name, - trust_remote_code=True, - speculative_config={ - "method": "deepseek_mtp", - "num_speculative_tokens": 1, - }, - max_model_len=256, - enforce_eager=True) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") - - # Heuristic: expect at least 66% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.66 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py index 35cb19a14..56fa6cc63 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -60,7 +60,6 @@ def eagle3_model_name(): def test_ngram_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -70,44 +69,40 @@ def test_ngram_correctness( should be the same when using ngram speculative decoding. ''' pytest.skip("Not current support for the test.") - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + spec_llm = LLM( + model=model_name, + speculative_config={ + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 3, + }, + max_model_len=1024, + enforce_eager=True, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - spec_llm = LLM( - model=model_name, - speculative_config={ - "method": "ngram", - "prompt_lookup_max": 5, - "prompt_lookup_min": 3, - "num_speculative_tokens": 3, - }, - max_model_len=1024, - enforce_eager=True, - ) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") - - # Heuristic: expect at least 70% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.7 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 70% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.7 * len(ref_outputs)) + del spec_llm @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"]) def test_eagle_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -119,43 +114,40 @@ def test_eagle_correctness( ''' if not use_eagle3: pytest.skip("Not current support for the test.") - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - spec_model_name = eagle3_model_name( - ) if use_eagle3 else eagle_model_name() - spec_llm = LLM( - model=model_name, - trust_remote_code=True, - enable_chunked_prefill=True, - max_num_seqs=1, - max_num_batched_tokens=2048, - gpu_memory_utilization=0.6, - speculative_config={ - "method": "eagle3" if use_eagle3 else "eagle", - "model": spec_model_name, - "num_speculative_tokens": 2, - "max_model_len": 128, - }, - max_model_len=128, - enforce_eager=True, - ) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") + spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name() + spec_llm = LLM( + model=model_name, + trust_remote_code=True, + enable_chunked_prefill=True, + max_num_seqs=1, + max_num_batched_tokens=2048, + gpu_memory_utilization=0.6, + speculative_config={ + "method": "eagle3" if use_eagle3 else "eagle", + "model": spec_model_name, + "num_speculative_tokens": 2, + "max_model_len": 128, + }, + max_model_len=128, + enforce_eager=True, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - # Heuristic: expect at least 66% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.66 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. + assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index 89dfa08e4..020196dac 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -20,14 +20,12 @@ Compare the outputs of vLLM with and without aclgraph. Run `pytest tests/compile/test_aclgraph.py`. 
""" -import os - import pytest import torch from vllm import LLM, SamplingParams -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", @@ -36,37 +34,29 @@ MODELS = [ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) def test_models( model: str, max_tokens: int, - monkeypatch: pytest.MonkeyPatch, ) -> None: - with monkeypatch.context() as m: - prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" - ] + prompts = [ + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" + ] - # aclgraph only support on v1 - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) + # TODO: change to use vllmrunner when the registry of custom op is solved + # while running pytest + vllm_model = LLM(model) + vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params) + del vllm_model + torch.npu.empty_cache() - sampling_params = SamplingParams(max_tokens=max_tokens, - temperature=0.0) - # TODO: change to use vllmrunner when the registry of custom op is solved - # while running pytest - vllm_model = LLM(model) - vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params) - del vllm_model - torch.npu.empty_cache() - - vllm_model = LLM(model, enforce_eager=True) - vllm_eager_outputs = vllm_model.generate(prompts, sampling_params) - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, enforce_eager=True) + vllm_eager_outputs = vllm_model.generate(prompts, sampling_params) + del vllm_model + torch.npu.empty_cache() vllm_aclgraph_outputs_list = [] for output in vllm_aclgraph_outputs: @@ -86,12 +76,9 @@ def test_models( ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") - m.setenv("VLLM_USE_V1", "1") with pytest.raises(NotImplementedError) as excinfo: VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat", max_model_len=1024, diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 9fed3560e..a114998d3 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -21,7 +21,7 @@ import torch from vllm import LLM, SamplingParams from vllm.utils import GiB_bytes -from tests.utils import fork_new_process_for_each_test +from tests.e2e.utils import fork_new_process_for_each_test from vllm_ascend.device_allocator.camem import CaMemAllocator diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py index 2240b88e2..874c8d187 100644 --- a/tests/e2e/singlecard/test_chunked.py +++ b/tests/e2e/singlecard/test_chunked.py @@ -20,8 +20,6 @@ Compare the outputs of vLLM with and without aclgraph. Run `pytest tests/compile/test_aclgraph.py`. 
""" -import os - import pytest import torch from vllm import LLM, SamplingParams @@ -29,8 +27,6 @@ from vllm import LLM, SamplingParams MODELS = ["deepseek-ai/DeepSeek-V2-Lite"] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="new chunked only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [1]) def test_models( @@ -39,36 +35,33 @@ def test_models( monkeypatch: pytest.MonkeyPatch, ) -> None: return - with monkeypatch.context() as m: - prompts = "The president of the United States is" - m.setenv("VLLM_USE_V1", "1") + prompts = "The president of the United States is" - sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=0.0, - ) + sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) - vllm_model = LLM(model, - long_prefill_token_threshold=4, - enforce_eager=True) - output_chunked = vllm_model.generate(prompts, sampling_params) - logprobs_chunked = output_chunked.outputs[0].logprobs - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True) + output_chunked = vllm_model.generate(prompts, sampling_params) + logprobs_chunked = output_chunked.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() - vllm_model = LLM(model, - enforce_eager=True, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True - }, - }) - output = vllm_model.generate(prompts, sampling_params) - logprobs = output.outputs[0].logprobs - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, + enforce_eager=True, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True + }, + }) + output = vllm_model.generate(prompts, sampling_params) + logprobs = output.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() - logprobs_similarity = torch.cosine_similarity( - logprobs_chunked.flatten(), logprobs.flatten(), dim=0) - assert logprobs_similarity > 0.95 + logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(), + logprobs.flatten(), + dim=0) + assert logprobs_similarity > 0.95 diff --git a/tests/e2e/singlecard/test_embedding.py b/tests/e2e/singlecard/test_embedding.py index 938f7cc3a..2868dc2e5 100644 --- a/tests/e2e/singlecard/test_embedding.py +++ b/tests/e2e/singlecard/test_embedding.py @@ -21,8 +21,8 @@ from typing import Optional from modelscope import snapshot_download # type: ignore[import-untyped] -from tests.conftest import HfRunner -from tests.utils import check_embeddings_close, matryoshka_fy +from tests.e2e.conftest import HfRunner +from tests.e2e.utils import check_embeddings_close, matryoshka_fy def run_embedding_correctness_test( diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 9d103a530..20c03a5c8 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -18,14 +18,14 @@ # import json import os -import re import jsonschema import pytest +import regex as re from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" @@ -85,11 +85,7 @@ def sample_json_schema(): def check_backend(guided_decoding_backend: str): - if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv( - "VLLM_USE_V1") == "0": - 
pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.") - if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv( - "VLLM_USE_V1") == "1": + if guided_decoding_backend not in GuidedDecodingBackendV1: pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.") diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py index 35f78ad77..e073e7c86 100644 --- a/tests/e2e/singlecard/test_ilama_lora.py +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -3,7 +3,7 @@ import vllm from modelscope import snapshot_download # type: ignore from vllm.lora.request import LoRARequest -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODEL_PATH = "vllm-ascend/ilama-3.2-1B" diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py index 26acb9428..400fe80d5 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/e2e/singlecard/test_offline_inference.py @@ -30,7 +30,7 @@ from vllm import SamplingParams from vllm.assets.image import ImageAsset import vllm_ascend # noqa: F401 -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", diff --git a/tests/utils.py b/tests/e2e/utils.py similarity index 100% rename from tests/utils.py rename to tests/e2e/utils.py diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index 85557fd1e..f5a28b4fd 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -14,7 +14,6 @@ # import os -from unittest import mock from transformers import PretrainedConfig from vllm.config import ModelConfig, VllmConfig @@ -170,25 +169,23 @@ class TestAscendConfig(TestBase): init_ascend_config(test_vllm_config) check_ascend_config(test_vllm_config, False) - # For V1 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) @_clean_up_ascend_config def test_check_ascend_config_wrong_case(self): diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index fd4e99980..b6bee95ed 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -373,7 +373,6 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm.envs.VLLM_USE_V1", True) def test_check_and_update_config_v1_worker_class_selection( self, mock_init_ascend, mock_check_ascend, mock_is_310p): mock_init_ascend.return_value = self.mock_ascend_config @@ -392,7 +391,6 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.ascend_config.check_ascend_config") 
@patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.utils.is_310p", return_value=True) - @patch("vllm.envs.VLLM_USE_V1", True) def test_check_and_update_config_310p_no_custom_ops( self, mock_is_310p, mock_init_ascend, mock_check_ascend): mock_init_ascend.return_value = self.mock_ascend_config