From 787010a637ad110b2ef3f12fab9e57f6607938e5 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Tue, 15 Jul 2025 12:49:57 +0800
Subject: [PATCH] [Test] Remove VLLM_USE_V1 in example and tests (#1733)

V1 is enabled by default, so there is no need to set it by hand anymore.
This PR removes the now-unnecessary settings from the examples and tests.

- vLLM version: v0.9.2
- vLLM main: https://github.com/vllm-project/vllm/commit/9ad0a4588ba4e9c979cda0d178dec4fcdb89fd0c

Signed-off-by: wangxiyuan
---
 .github/workflows/vllm_ascend_test.yaml | 69 ++-------
 examples/offline_data_parallel.py | 1 -
 examples/offline_dualbatch_overlap_npu.py | 1 -
 examples/offline_inference_sleep_mode_npu.py | 1 -
 examples/run_dp_attention_etp16.sh | 1 -
 requirements-dev.txt | 1 +
 requirements-lint.txt | 1 +
 tests/{ => e2e}/conftest.py | 6 +-
 tests/{ => e2e}/model_utils.py | 0
 .../multicard/test_fused_moe_allgather_ep.py | 15 +-
 tests/e2e/multicard/test_ilama_lora_tp2.py | 2 +-
 .../test_offline_inference_distributed.py | 2 +-
 tests/e2e/multicard/test_pipeline_parallel.py | 2 +-
 tests/e2e/multicard/test_prefix_caching.py | 10 +-
 .../e2e/multicard/test_torchair_graph_mode.py | 10 +-
 .../test_ascend_scheduler_e2e.py | 4 -
 .../ascend_scheduler/test_chunk_prefill.py | 4 +-
 .../spec_decode_v1/test_v1_mtp_correctness.py | 54 ++++---
 .../spec_decode_v1/test_v1_spec_decode.py | 132 ++++++++----------
 tests/e2e/singlecard/test_aclgraph.py | 47 +++----
 tests/e2e/singlecard/test_camem.py | 2 +-
 tests/e2e/singlecard/test_chunked.py | 57 ++++----
 tests/e2e/singlecard/test_embedding.py | 4 +-
 tests/e2e/singlecard/test_guided_decoding.py | 10 +-
 tests/e2e/singlecard/test_ilama_lora.py | 2 +-
 .../e2e/singlecard/test_offline_inference.py | 2 +-
 tests/{ => e2e}/utils.py | 0
 tests/ut/test_ascend_config.py | 35 +++--
 tests/ut/test_platform.py | 2 -
 29 files changed, 186 insertions(+), 291 deletions(-)
 rename tests/{ => e2e}/conftest.py (98%)
 rename tests/{ => e2e}/model_utils.py (100%)
 rename tests/{ => e2e}/utils.py (100%)

diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index ec23fcef1..d06fd3d25 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -41,16 +41,10 @@ concurrency:
 
 jobs:
   lint:
-    # Only trigger lint on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     uses: ./.github/workflows/pre-commit.yml
 
   changes:
-    # Only trigger changes on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
     outputs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
@@ -60,20 +54,24 @@ jobs:
         with:
          filters: |
            e2e_tracker:
+              - '.github/workflows/vllm_ascend_test.yaml'
               - 'vllm_ascend/**'
               - 'csrc/**'
               - 'cmake/**'
               - 'tests/e2e/**'
-              - 'tests/conftest.py'
-              - 'tests/model_utils.py'
-              - 'tests/utils.py'
+              - 'CMakeLists.txt'
+              - 'setup.py'
+              - 'requirements.txt'
+              - 'requirements-dev.txt'
+              - 'requirements-lint.txt'
+              - 'packages.txt'
            ut_tracker:
              - 'tests/ut/**'
 
   ut:
     needs: [lint, changes]
     name: unit test
-    # only trigger unit test after lint passed and the change is e2e and ut related. Or the PR is merged.
-    if: ${{ github.event_name == 'push' || (needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true')) }}
+    # only trigger unit test after lint passed and the change is e2e and ut related.
+ if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} runs-on: ubuntu-latest container: image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 @@ -112,9 +110,8 @@ jobs: python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ - - name: Run unit test for V1 Engine + - name: Run unit test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn TORCH_DEVICE_BACKEND_AUTOLOAD: 0 run: | @@ -133,8 +130,8 @@ jobs: e2e: needs: [lint, changes] - # only trigger e2e test after lint passed and the change is e2e related. - if: ${{ needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }} + # only trigger e2e test after lint passed and the change is e2e related with pull request. + if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }} strategy: max-parallel: 2 matrix: @@ -189,9 +186,8 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run e2e test for V1 Engine + - name: Run e2e test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | @@ -213,26 +209,6 @@ jobs: # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - name: Run e2e test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/test_prompt_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py \ - --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ - --ignore=tests/e2e/singlecard/test_embedding.py - e2e-4-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} @@ -290,9 +266,8 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run vllm-project/vllm-ascend test for V1 Engine + - name: Run vllm-project/vllm-ascend test env: - VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | @@ -308,19 +283,3 @@ jobs: pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ --ignore=tests/e2e/multicard/test_data_parallel.py - - - name: Run vllm-project/vllm-ascend test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/test_data_parallel.py - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ - --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ - --ignore=tests/e2e/multicard/test_data_parallel.py diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py index 64084ac69..754dfbc7c 100644 --- a/examples/offline_data_parallel.py +++ b/examples/offline_data_parallel.py @@ -120,7 +120,6 @@ def main( trust_remote_code, ): # DP only support on V1 engine - os.environ["VLLM_USE_V1"] = "1" os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py index d8153e38c..e721ab2aa 100644 --- a/examples/offline_dualbatch_overlap_npu.py +++ b/examples/offline_dualbatch_overlap_npu.py @@ -5,7 +5,6 @@ from vllm import LLM, SamplingParams # enable dual-batch overlap for vllm ascend os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1" -os.environ["VLLM_USE_V1"] = "1" # Sample prompts. prompts = ["The president of the United States is"] * 41 diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py index 01b3b4a7e..7b7d42268 100644 --- a/examples/offline_inference_sleep_mode_npu.py +++ b/examples/offline_inference_sleep_mode_npu.py @@ -22,7 +22,6 @@ import torch from vllm import LLM, SamplingParams from vllm.utils import GiB_bytes -os.environ["VLLM_USE_V1"] = "1" os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh index 62233de7c..5d87879a1 100644 --- a/examples/run_dp_attention_etp16.sh +++ b/examples/run_dp_attention_etp16.sh @@ -1,4 +1,3 @@ -export VLLM_USE_V1=1 export TASK_QUEUE_ENABLE=1 source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/nnal/atb/set_env.sh diff --git a/requirements-dev.txt b/requirements-dev.txt index 8bd7dcadb..d4a5acd1a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,4 +12,5 @@ xgrammar zmq types-psutil pytest-cov +regex sentence_transformers diff --git a/requirements-lint.txt b/requirements-lint.txt index eab3838f0..8a575e5a2 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -4,5 +4,6 @@ pre-commit==4.0.1 # type checking mypy==1.11.1 types-PyYAML +types-regex types-requests types-setuptools diff --git a/tests/conftest.py b/tests/e2e/conftest.py similarity index 98% rename from tests/conftest.py rename to tests/e2e/conftest.py index 64bc466e7..50ca0f3e3 100644 --- a/tests/conftest.py +++ b/tests/e2e/conftest.py @@ -39,8 +39,8 @@ from vllm.sampling_params import BeamSearchParams from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils import is_list_of -from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs) +from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) # TODO: remove this part after the patch merged into vllm, if # we not explicitly patch here, some of them might be effectiveless # in pytest scenario @@ -62,7 +62,7 @@ PromptAudioInput = 
_PromptMultiModalInput[Tuple[np.ndarray, int]] PromptVideoInput = _PromptMultiModalInput[np.ndarray] _TEST_DIR = os.path.dirname(__file__) -_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")] +_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] def cleanup_dist_env_and_memory(shutdown_ray: bool = False): diff --git a/tests/model_utils.py b/tests/e2e/model_utils.py similarity index 100% rename from tests/model_utils.py rename to tests/e2e/model_utils.py diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py index ad755dd16..221d33f0d 100644 --- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py +++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py @@ -26,12 +26,11 @@ from unittest.mock import patch from modelscope import snapshot_download # type: ignore from vllm import SamplingParams -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner @patch.dict( os.environ, { - "VLLM_USE_V1": "1", "VLLM_WORKER_MULTIPROC_METHOD": "spawn", "TASK_QUEUE_ENABLE": "1", "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1" @@ -56,12 +55,10 @@ def test_generate_with_allgather(): vllm_model.generate(example_prompts, sampling_params) -@patch.dict( - os.environ, { - "VLLM_USE_V1": "1", - "VLLM_WORKER_MULTIPROC_METHOD": "spawn", - "TASK_QUEUE_ENABLE": "1" - }) +@patch.dict(os.environ, { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "TASK_QUEUE_ENABLE": "1" +}) def test_generate_with_alltoall(): example_prompts = ["Hello, my name is"] sampling_params = SamplingParams(max_tokens=100, temperature=0.0) @@ -79,4 +76,4 @@ def test_generate_with_alltoall(): }, "expert_tensor_parallel_size": 1 }) as vllm_model: - vllm_model.generate(example_prompts, sampling_params) \ No newline at end of file + vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py index 3f62bfd7e..e22550c2f 100644 --- a/tests/e2e/multicard/test_ilama_lora_tp2.py +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -1,7 +1,7 @@ import pytest from modelscope import snapshot_download # type: ignore -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index 58d0bf0ba..2b155383c 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -27,7 +27,7 @@ from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from vllm.model_executor.models.registry import ModelRegistry -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py index a7070b688..c0c757ab3 100644 --- a/tests/e2e/multicard/test_pipeline_parallel.py +++ b/tests/e2e/multicard/test_pipeline_parallel.py @@ -16,7 +16,7 @@ # import pytest -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODELS = [ "Qwen/Qwen3-0.6B", diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py index 368d3ff95..73d0d2c4a 100644 --- a/tests/e2e/multicard/test_prefix_caching.py +++ 
b/tests/e2e/multicard/test_prefix_caching.py @@ -2,12 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching on V1 scheduler or AscendScheduler.""" -import os - import pytest -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ # for MHA @@ -60,8 +58,6 @@ INPUT_PROMPTS = [ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: @@ -89,8 +85,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="mtp is not supported on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [50]) def test_prefix_cache_with_ascend_scheduler(model: str, diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index ce628f9d3..d363560dd 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -22,9 +22,7 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`. import os from typing import Dict -import pytest - -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" @@ -78,8 +76,6 @@ def _deepseek_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair(): additional_config = { "torchair_graph_config": { @@ -89,8 +85,6 @@ def test_e2e_deepseekv3_with_torchair(): _deepseek_torchair_test_fixture(additional_config) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_deepseekv3_with_torchair_ms_mla(): additional_config = { "torchair_graph_config": { @@ -150,8 +144,6 @@ def _pangu_torchair_test_fixture( print(f"Generated text: {vllm_output[i][1]!r}") -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="torchair graph is not supported on v0") def test_e2e_pangu_with_torchair(): additional_config = { "torchair_graph_config": { diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py index 17116ab59..73e392a28 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py +++ b/tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler_e2e.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc -import os import pytest import torch from vllm import LLM -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "Qwen/Qwen2.5-0.5B-Instruct" PROMPT = "Hello my name is Robert and I" diff --git a/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py b/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py index 0b557960e..e25e857da 100644 --- a/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py +++ 
b/tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py @@ -9,8 +9,8 @@ Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`. """ import pytest -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ "Qwen/Qwen3-0.6B-Base", diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py index 0cf64b059..10322f49e 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py @@ -53,7 +53,6 @@ def model_name(): @pytest.mark.skipif( True, reason="TODO: Enable me after test_mtp_correctness is fixed") def test_mtp_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -62,33 +61,30 @@ def test_mtp_correctness( Compare the outputs of a original LLM and a speculative LLM should be the same when using mtp speculative decoding. ''' - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + spec_llm = LLM(model=model_name, + trust_remote_code=True, + speculative_config={ + "method": "deepseek_mtp", + "num_speculative_tokens": 1, + }, + max_model_len=256, + enforce_eager=True) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - spec_llm = LLM(model=model_name, - trust_remote_code=True, - speculative_config={ - "method": "deepseek_mtp", - "num_speculative_tokens": 1, - }, - max_model_len=256, - enforce_eager=True) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") - - # Heuristic: expect at least 66% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.66 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py index 35cb19a14..56fa6cc63 100644 --- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -60,7 +60,6 @@ def eagle3_model_name(): def test_ngram_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -70,44 +69,40 @@ def test_ngram_correctness( should be the same when using ngram speculative decoding. ''' pytest.skip("Not current support for the test.") - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + spec_llm = LLM( + model=model_name, + speculative_config={ + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 3, + }, + max_model_len=1024, + enforce_eager=True, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - spec_llm = LLM( - model=model_name, - speculative_config={ - "method": "ngram", - "prompt_lookup_max": 5, - "prompt_lookup_min": 3, - "num_speculative_tokens": 3, - }, - max_model_len=1024, - enforce_eager=True, - ) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") - - # Heuristic: expect at least 70% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.7 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 70% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.7 * len(ref_outputs)) + del spec_llm @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"]) def test_eagle_correctness( - monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, @@ -119,43 +114,40 @@ def test_eagle_correctness( ''' if not use_eagle3: pytest.skip("Not current support for the test.") - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) - ref_outputs = ref_llm.chat(test_prompts, sampling_config) - del ref_llm + ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm - spec_model_name = eagle3_model_name( - ) if use_eagle3 else eagle_model_name() - spec_llm = LLM( - model=model_name, - trust_remote_code=True, - enable_chunked_prefill=True, - max_num_seqs=1, - max_num_batched_tokens=2048, - gpu_memory_utilization=0.6, - speculative_config={ - "method": "eagle3" if use_eagle3 else "eagle", - "model": spec_model_name, - "num_speculative_tokens": 2, - "max_model_len": 128, - }, - max_model_len=128, - enforce_eager=True, - ) - spec_outputs = spec_llm.chat(test_prompts, sampling_config) - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") + spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name() + spec_llm = LLM( + model=model_name, + trust_remote_code=True, + enable_chunked_prefill=True, + max_num_seqs=1, + max_num_batched_tokens=2048, + gpu_memory_utilization=0.6, + speculative_config={ + "method": "eagle3" if use_eagle3 else "eagle", + "model": spec_model_name, + "num_speculative_tokens": 2, + "max_model_len": 128, + }, + max_model_len=128, + enforce_eager=True, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") - # Heuristic: expect at least 66% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.66 * len(ref_outputs)) - del spec_llm + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. + assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index 89dfa08e4..020196dac 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -20,14 +20,12 @@ Compare the outputs of vLLM with and without aclgraph. Run `pytest tests/compile/test_aclgraph.py`. 
""" -import os - import pytest import torch from vllm import LLM, SamplingParams -from tests.conftest import VllmRunner -from tests.model_utils import check_outputs_equal +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", @@ -36,37 +34,29 @@ MODELS = [ ] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) def test_models( model: str, max_tokens: int, - monkeypatch: pytest.MonkeyPatch, ) -> None: - with monkeypatch.context() as m: - prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" - ] + prompts = [ + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" + ] - # aclgraph only support on v1 - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) + # TODO: change to use vllmrunner when the registry of custom op is solved + # while running pytest + vllm_model = LLM(model) + vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params) + del vllm_model + torch.npu.empty_cache() - sampling_params = SamplingParams(max_tokens=max_tokens, - temperature=0.0) - # TODO: change to use vllmrunner when the registry of custom op is solved - # while running pytest - vllm_model = LLM(model) - vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params) - del vllm_model - torch.npu.empty_cache() - - vllm_model = LLM(model, enforce_eager=True) - vllm_eager_outputs = vllm_model.generate(prompts, sampling_params) - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, enforce_eager=True) + vllm_eager_outputs = vllm_model.generate(prompts, sampling_params) + del vllm_model + torch.npu.empty_cache() vllm_aclgraph_outputs_list = [] for output in vllm_aclgraph_outputs: @@ -86,12 +76,9 @@ def test_models( ) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") - m.setenv("VLLM_USE_V1", "1") with pytest.raises(NotImplementedError) as excinfo: VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat", max_model_len=1024, diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 9fed3560e..a114998d3 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -21,7 +21,7 @@ import torch from vllm import LLM, SamplingParams from vllm.utils import GiB_bytes -from tests.utils import fork_new_process_for_each_test +from tests.e2e.utils import fork_new_process_for_each_test from vllm_ascend.device_allocator.camem import CaMemAllocator diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py index 2240b88e2..874c8d187 100644 --- a/tests/e2e/singlecard/test_chunked.py +++ b/tests/e2e/singlecard/test_chunked.py @@ -20,8 +20,6 @@ Compare the outputs of vLLM with and without aclgraph. Run `pytest tests/compile/test_aclgraph.py`. 
""" -import os - import pytest import torch from vllm import LLM, SamplingParams @@ -29,8 +27,6 @@ from vllm import LLM, SamplingParams MODELS = ["deepseek-ai/DeepSeek-V2-Lite"] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="new chunked only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [1]) def test_models( @@ -39,36 +35,33 @@ def test_models( monkeypatch: pytest.MonkeyPatch, ) -> None: return - with monkeypatch.context() as m: - prompts = "The president of the United States is" - m.setenv("VLLM_USE_V1", "1") + prompts = "The president of the United States is" - sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=0.0, - ) + sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) - vllm_model = LLM(model, - long_prefill_token_threshold=4, - enforce_eager=True) - output_chunked = vllm_model.generate(prompts, sampling_params) - logprobs_chunked = output_chunked.outputs[0].logprobs - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True) + output_chunked = vllm_model.generate(prompts, sampling_params) + logprobs_chunked = output_chunked.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() - vllm_model = LLM(model, - enforce_eager=True, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True - }, - }) - output = vllm_model.generate(prompts, sampling_params) - logprobs = output.outputs[0].logprobs - del vllm_model - torch.npu.empty_cache() + vllm_model = LLM(model, + enforce_eager=True, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True + }, + }) + output = vllm_model.generate(prompts, sampling_params) + logprobs = output.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() - logprobs_similarity = torch.cosine_similarity( - logprobs_chunked.flatten(), logprobs.flatten(), dim=0) - assert logprobs_similarity > 0.95 + logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(), + logprobs.flatten(), + dim=0) + assert logprobs_similarity > 0.95 diff --git a/tests/e2e/singlecard/test_embedding.py b/tests/e2e/singlecard/test_embedding.py index 938f7cc3a..2868dc2e5 100644 --- a/tests/e2e/singlecard/test_embedding.py +++ b/tests/e2e/singlecard/test_embedding.py @@ -21,8 +21,8 @@ from typing import Optional from modelscope import snapshot_download # type: ignore[import-untyped] -from tests.conftest import HfRunner -from tests.utils import check_embeddings_close, matryoshka_fy +from tests.e2e.conftest import HfRunner +from tests.e2e.utils import check_embeddings_close, matryoshka_fy def run_embedding_correctness_test( diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 9d103a530..20c03a5c8 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -18,14 +18,14 @@ # import json import os -import re import jsonschema import pytest +import regex as re from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" @@ -85,11 +85,7 @@ def sample_json_schema(): def check_backend(guided_decoding_backend: str): - if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv( - "VLLM_USE_V1") == "0": - 
pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.") - if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv( - "VLLM_USE_V1") == "1": + if guided_decoding_backend not in GuidedDecodingBackendV1: pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.") diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py index 35f78ad77..e073e7c86 100644 --- a/tests/e2e/singlecard/test_ilama_lora.py +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -3,7 +3,7 @@ import vllm from modelscope import snapshot_download # type: ignore from vllm.lora.request import LoRARequest -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODEL_PATH = "vllm-ascend/ilama-3.2-1B" diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py index 26acb9428..400fe80d5 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/e2e/singlecard/test_offline_inference.py @@ -30,7 +30,7 @@ from vllm import SamplingParams from vllm.assets.image import ImageAsset import vllm_ascend # noqa: F401 -from tests.conftest import VllmRunner +from tests.e2e.conftest import VllmRunner MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", diff --git a/tests/utils.py b/tests/e2e/utils.py similarity index 100% rename from tests/utils.py rename to tests/e2e/utils.py diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index 85557fd1e..f5a28b4fd 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -14,7 +14,6 @@ # import os -from unittest import mock from transformers import PretrainedConfig from vllm.config import ModelConfig, VllmConfig @@ -170,25 +169,23 @@ class TestAscendConfig(TestBase): init_ascend_config(test_vllm_config) check_ascend_config(test_vllm_config, False) - # For V1 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) @_clean_up_ascend_config def test_check_ascend_config_wrong_case(self): diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index fd4e99980..b6bee95ed 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -373,7 +373,6 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.utils.is_310p", return_value=False) @patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch("vllm.envs.VLLM_USE_V1", True) def test_check_and_update_config_v1_worker_class_selection( self, mock_init_ascend, mock_check_ascend, mock_is_310p): mock_init_ascend.return_value = self.mock_ascend_config @@ -392,7 +391,6 @@ class TestNPUPlatform(TestBase): @patch("vllm_ascend.ascend_config.check_ascend_config") 
@patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.utils.is_310p", return_value=True) - @patch("vllm.envs.VLLM_USE_V1", True) def test_check_and_update_config_310p_no_custom_ops( self, mock_is_310p, mock_init_ascend, mock_check_ascend): mock_init_ascend.return_value = self.mock_ascend_config