enable more tests (#161192)

Enable more vLLM tests against PyTorch main, and add a schedule to run them every 12 hours. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161192 Approved by: https://github.com/huydhn
2025-10-20 21:14:14 +08:00 · 2025-08-22 17:07:30 -07:00
parent 36ac916929
commit 6443ea337d
7 changed files with 174 additions and 14 deletions
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -3,7 +3,7 @@ from typing import Any

 from cli.lib.common.git_helper import clone_external_repo
 from cli.lib.common.pip_helper import pip_install_packages
-from cli.lib.common.utils import run_command, working_directory
+from cli.lib.common.utils import run_command, temp_environ, working_directory


 logger = logging.getLogger(__name__)
@ -20,8 +20,10 @@ def sample_vllm_test_library():
        "vllm_basic_correctness_test": {
            "title": "Basic Correctness Test",
            "id": "vllm_basic_correctness_test",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
            "steps": [
-                "export VLLM_WORKER_MULTIPROC_METHOD=spawn",
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
@ -42,8 +44,10 @@ def sample_vllm_test_library():
        "vllm_entrypoints_test": {
            "title": "Entrypoints Test ",
            "id": "vllm_entrypoints_test",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
            "steps": [
-                "export VLLM_WORKER_MULTIPROC_METHOD=spawn",
                " ".join(
                    [
                        "pytest",
@ -70,10 +74,105 @@ def sample_vllm_test_library():
                "pytest -v -s test_regression.py",
            ],
        },
+        "vllm_lora_tp_test_distributed": {
+            "title": "LoRA TP Test (Distributed)",
+            "id": "vllm_lora_tp_test_distributed",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
+            "num_gpus": 4,
+            "steps": [
+                "pytest -v -s -x lora/test_chatglm3_tp.py",
+                "echo $VLLM_WORKER_MULTIPROC_METHOD",
+                "pytest -v -s -x lora/test_llama_tp.py",
+                "pytest -v -s -x lora/test_multi_loras_with_tp.py",
+            ],
+        },
+        "vllm_lora_280_failure_test": {
+            "title": "LoRA 280 failure test",
+            "id": "vllm_lora_280_failure_test",
+            "steps": ["pytest -v lora/test_quant_model.py"],
+        },
+        "vllm_multi_model_processor_test": {
+            "title": "Multi-Modal Processor Test",
+            "id": "vllm_multi_model_processor_test",
+            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
+            "steps": [
+                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
+            ],
+        },
+        "vllm_pytorch_compilation_unit_tests": {
+            "title": "PyTorch Compilation Unit Tests",
+            "id": "vllm_pytorch_compilation_unit_tests",
+            "steps": [
+                "pytest -v -s compile/test_pass_manager.py",
+                "pytest -v -s compile/test_fusion.py",
+                "pytest -v -s compile/test_fusion_attn.py",
+                "pytest -v -s compile/test_silu_mul_quant_fusion.py",
+                "pytest -v -s compile/test_sequence_parallelism.py",
+                "pytest -v -s compile/test_async_tp.py",
+                "pytest -v -s compile/test_fusion_all_reduce.py",
+                "pytest -v -s compile/test_decorator.py",
+            ],
+        },
+        # TODO(elainewy):need to add g6 with 4 gpus to run this test
+        "vllm_lora_test": {
+            "title": "LoRA Test %N",
+            "id": "lora_test",
+            "parallelism": 4,
+            "steps": [
+                "echo '[checking] list sharded lora tests:'",
+                " ".join(
+                    [
+                        "pytest -q --collect-only lora",
+                        "--shard-id=$$BUILDKITE_PARALLEL_JOB",
+                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
+                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
+                    ]
+                ),
+                "echo '[checking] Done. list lora tests'",
+                " ".join(
+                    [
+                        "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB",
+                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
+                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
+                    ]
+                ),
+            ],
+        },
    }


-def run_test_plan(test_plan: str, test_target: str, tests_map: dict[str, Any]):
+def check_parallelism(
+    tests: Any, title: str, shard_id: int = 0, num_shards: int = 0
+) -> bool:
+    """
+    Check whether a test plan is sharded and validate the requested shard.
+
+    A plan counts as parallel when its optional "parallelism" entry is greater
+    than 1. For a parallel plan, the caller must supply a shard count equal to
+    that value and a zero-based shard id within [0, num_shards).
+
+    Args:
+        tests: the test plan mapping; only its "parallelism" entry is read.
+        title: the plan title, used only in error messages.
+        shard_id: zero-based index of the shard to run.
+        num_shards: total number of shards the caller is running.
+
+    Returns:
+        True if the plan is parallel and the shard arguments are valid,
+        False if the plan is not parallel (shard arguments are then ignored).
+
+    Raises:
+        RuntimeError: if num_shards differs from the plan's parallelism, or if
+            shard_id is outside [0, num_shards).
+    """
+    parallelism = int(tests.get("parallelism", "0"))
+    if parallelism <= 1:
+        return False
+
+    # Validate the shard count first so that a wrong count is reported as such,
+    # rather than surfacing as a confusing shard-id error.
+    if num_shards != parallelism:
+        raise RuntimeError(
+            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
+        )
+
+    # Shard ids are zero-based, so num_shards itself is already out of range.
+    if shard_id < 0 or shard_id >= num_shards:
+        raise RuntimeError(
+            f"Test {title} expects shard id in [0, {num_shards}), "
+            f"but invalid {shard_id} is provided"
+        )
+
+    return True
+
+
+def run_test_plan(
+    test_plan: str,
+    test_target: str,
+    tests_map: dict[str, Any],
+    shard_id: int = 0,
+    num_shards: int = 0,
+):
    """
    a method to run list of tests based on the test plan.
    """
@ -83,17 +182,31 @@ def run_test_plan(test_plan: str, test_target: str, tests_map: dict[str, Any]):
            f"test {test_plan} not found, please add it to test plan pool"
        )
    tests = tests_map[test_plan]
-    logger.info("Running tests: %s", tests["title"])
    pkgs = tests.get("package_install", [])
+    title = tests.get("title", "unknown test")
+
+    is_parallel = check_parallelism(tests, title, shard_id, num_shards)
+    if is_parallel:
+        title = title.replace("%N", f"{shard_id}/{num_shards}")
+
+    logger.info("Running tests: %s", title)
    if pkgs:
        logger.info("Installing packages: %s", pkgs)
        pip_install_packages(packages=pkgs, prefer_uv=True)
-    with working_directory(tests.get("working_directory", "tests")):
+    with (
+        working_directory(tests.get("working_directory", "tests")),
+        temp_environ(tests.get("env_vars", {})),
+    ):
        failures = []
        for step in tests["steps"]:
+            logger.info("Running step: %s", step)
+            if is_parallel:
+                step = replace_buildkite_placeholders(step, shard_id, num_shards)
+                logger.info("Running parallel step: %s", step)
            code = run_command(cmd=step, check=False, use_shell=True)
            if code != 0:
                failures.append(step)
+            logger.info("Finish running step: %s", step)
        if failures:
            logger.error("Failed tests: %s", failures)
            raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}")
@ -107,3 +220,13 @@ def clone_vllm(dst: str = "vllm"):
        dst=dst,
        update_submodules=True,
    )
+
+
+def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
+    """
+    Fill in the shard placeholders of a shell step.
+
+    Every "$$BUILDKITE_PARALLEL_JOB_COUNT" in the step becomes num_shards and
+    every "$$BUILDKITE_PARALLEL_JOB" becomes shard_id; all other text is
+    returned unchanged.
+    """
+    # Longest placeholder first: "$$BUILDKITE_PARALLEL_JOB" is a prefix of
+    # "$$BUILDKITE_PARALLEL_JOB_COUNT", so substituting it first would leave a
+    # stray "_COUNT" suffix behind.
+    substitutions = (
+        ("$$BUILDKITE_PARALLEL_JOB_COUNT", str(num_shards)),
+        ("$$BUILDKITE_PARALLEL_JOB", str(shard_id)),
+    )
+    rendered = step
+    for placeholder, value in substitutions:
+        rendered = rendered.replace(placeholder, value)
+    return rendered
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@ -61,6 +61,9 @@ class VllmTestRunner(BaseRunner):
        self.test_plan = ""
        self.test_type = TestInpuType.UNKNOWN

+        self.shard_id = args.shard_id
+        self.num_shards = args.num_shards
+
        if args.test_plan:
            self.test_plan = args.test_plan
            self.test_type = TestInpuType.TEST_PLAN
@ -103,7 +106,16 @@ class VllmTestRunner(BaseRunner):
        self.prepare()
        with working_directory(self.work_directory):
            if self.test_type == TestInpuType.TEST_PLAN:
-                run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
+                if self.num_shards > 1:
+                    run_test_plan(
+                        self.test_plan,
+                        "vllm",
+                        sample_vllm_test_library(),
+                        self.shard_id,
+                        self.num_shards,
+                    )
+                else:
+                    run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
            else:
                raise ValueError(f"Unknown test type {self.test_type}")

--- a/.ci/lumen_cli/cli/test_cli/register_test.py
+++ b/.ci/lumen_cli/cli/test_cli/register_test.py
@ -22,6 +22,18 @@ def common_args(parser: argparse.ArgumentParser) -> None:
    """
    Add common CLI arguments to the given parser.
    """
+    parser.add_argument(
+        "--shard-id",
+        type=int,
+        default=1,
+        help="a shard id to run, e.g. '0,1,2,3'",
+    )
+    parser.add_argument(
+        "--num-shards",
+        type=int,
+        default=1,
+        help="a number of shards to run, e.g. '4'",
+    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-tp",
@ -29,7 +41,6 @@ def common_args(parser: argparse.ArgumentParser) -> None:
        type=str,
        help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
    )
-    # TODO(elainewy):add another common option that user can trigger a specific test with test config


 def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
--- a/.ci/lumen_cli/tests/test_run_plan.py
+++ b/.ci/lumen_cli/tests/test_run_plan.py
@ -45,6 +45,10 @@ def patch_module(monkeypatch):
        workdir_calls.append(path)
        return nullcontext()

+    def fake_temp_env(map: dict[str, str]):
+        temp_calls.append(map)
+        return nullcontext()
+
    logger = SimpleNamespace(
        info=MagicMock(name="logger.info"),
        error=MagicMock(name="logger.error"),
@ -58,6 +62,7 @@ def patch_module(monkeypatch):
    monkeypatch.setattr(
        module, "working_directory", fake_working_directory, raising=True
    )
+    monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True)
    monkeypatch.setattr(module, "logger", logger, raising=True)

    return SimpleNamespace(
@ -79,8 +84,8 @@ def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_modu
            "title": "Basic suite",
            "package_install": [],
            "working_directory": "tests",
+            "env_vars": {"GLOBAL_FLAG": "1"},
            "steps": [
-                "export GLOBAL_FLAG=1",
                "export A=x && pytest -q",
                "export B=y && pytest -q tests/unit",
            ],
@ -97,14 +102,13 @@ def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_modu
    checks = [_get_check(c) for c in calls]

    assert cmds == [
-        "export GLOBAL_FLAG=1",
        "export A=x && pytest -q",
        "export B=y && pytest -q tests/unit",
    ]
    assert all(chk is False for chk in checks)

-    # No temp_env assertions anymore
    assert patch_module.workdir_calls == ["tests"]
+    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]


 def test_installs_packages_when_present(monkeypatch, patch_module):
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1636,7 +1636,7 @@ elif [[ "$TEST_CONFIG" == *vllm* ]]; then
    fi
    echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
    (cd .ci/lumen_cli && python -m pip install -e .)
-    python -m cli.run test external vllm --test-plan "$TEST_CONFIG"
+    python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
 elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -177,7 +177,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
 RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
 RUN cat torch_build_versions.txt
-
 RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'

 #################### BASE BUILD IMAGE ####################
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -5,6 +5,9 @@ on:
    tags:
      - ciflow/vllm/*
  workflow_dispatch:
+  schedule:
+    # Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
+    - cron: '0 0,12 * * *'

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -41,8 +44,16 @@ jobs:
        { include: [
          { config:  "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
          { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
          { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 2, num_shards: 4,  runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 3, num_shards: 4,  runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"},
        ]}
    secrets: inherit