From 6443ea337df843681bc558d99efa84a3e5559b7f Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Fri, 22 Aug 2025 17:07:30 -0700
Subject: [PATCH] enable more tests (#161192)

Enable more vllm tests against pytorch main, and add a schedule to run
the tests every 12 hours.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161192
Approved by: https://github.com/huydhn
---
 .ci/lumen_cli/cli/lib/core/vllm/lib.py       | 135 ++++++++++++++++++-
 .ci/lumen_cli/cli/lib/core/vllm/vllm_test.py |  14 +-
 .ci/lumen_cli/cli/test_cli/register_test.py  |  13 +-
 .ci/lumen_cli/tests/test_run_plan.py         |  10 +-
 .ci/pytorch/test.sh                          |   2 +-
 .github/ci_configs/vllm/Dockerfile.tmp_vllm  |   1 -
 .github/workflows/vllm.yml                   |  13 +-
 7 files changed, 174 insertions(+), 14 deletions(-)

diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
index 2fa2618a27d8..7f3a930b2cc6 100644
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@@ -3,7 +3,7 @@ from typing import Any
 
 from cli.lib.common.git_helper import clone_external_repo
 from cli.lib.common.pip_helper import pip_install_packages
-from cli.lib.common.utils import run_command, working_directory
+from cli.lib.common.utils import run_command, temp_environ, working_directory
 
 
 logger = logging.getLogger(__name__)
@@ -20,8 +20,10 @@ def sample_vllm_test_library():
         "vllm_basic_correctness_test": {
             "title": "Basic Correctness Test",
             "id": "vllm_basic_correctness_test",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
             "steps": [
-                "export VLLM_WORKER_MULTIPROC_METHOD=spawn",
                 "pytest -v -s basic_correctness/test_cumem.py",
                 "pytest -v -s basic_correctness/test_basic_correctness.py",
                 "pytest -v -s basic_correctness/test_cpu_offload.py",
@@ -42,8 +44,10 @@ def sample_vllm_test_library():
         "vllm_entrypoints_test": {
             "title": "Entrypoints Test ",
             "id": "vllm_entrypoints_test",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
             "steps": [
-                "export VLLM_WORKER_MULTIPROC_METHOD=spawn",
                 " ".join(
                     [
                         "pytest",
@@ -70,10 +74,105 @@ def sample_vllm_test_library():
                 "pytest -v -s test_regression.py",
             ],
         },
+        "vllm_lora_tp_test_distributed": {
+            "title": "LoRA TP Test (Distributed)",
+            "id": "vllm_lora_tp_test_distributed",
+            "env_vars": {
+                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            },
+            "num_gpus": 4,
+            "steps": [
+                "pytest -v -s -x lora/test_chatglm3_tp.py",
+                "echo $VLLM_WORKER_MULTIPROC_METHOD",
+                "pytest -v -s -x lora/test_llama_tp.py",
+                "pytest -v -s -x lora/test_multi_loras_with_tp.py",
+            ],
+        },
+        "vllm_lora_280_failure_test": {
+            "title": "LoRA 280 failure test",
+            "id": "vllm_lora_280_failure_test",
+            "steps": ["pytest -v lora/test_quant_model.py"],
+        },
+        "vllm_multi_model_processor_test": {
+            "title": "Multi-Modal Processor Test",
+            "id": "vllm_multi_model_processor_test",
+            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
+            "steps": [
+                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
+            ],
+        },
+        "vllm_pytorch_compilation_unit_tests": {
+            "title": "PyTorch Compilation Unit Tests",
+            "id": "vllm_pytorch_compilation_unit_tests",
+            "steps": [
+                "pytest -v -s compile/test_pass_manager.py",
+                "pytest -v -s compile/test_fusion.py",
+                "pytest -v -s compile/test_fusion_attn.py",
+                "pytest -v -s compile/test_silu_mul_quant_fusion.py",
+                "pytest -v -s compile/test_sequence_parallelism.py",
+                "pytest -v -s compile/test_async_tp.py",
+                "pytest -v -s compile/test_fusion_all_reduce.py",
+                "pytest -v -s compile/test_decorator.py",
+            ],
+        },
+        # TODO(elainewy):need to add g6 with 4 gpus to run this test
+        "vllm_lora_test": {
+            "title": "LoRA Test %N",
+            "id": "lora_test",
+            "parallelism": 4,
+            "steps": [
+                "echo '[checking] list sharded lora tests:'",
+                " ".join(
+                    [
+                        "pytest -q --collect-only lora",
+                        "--shard-id=$$BUILDKITE_PARALLEL_JOB",
+                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
+                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
+                    ]
+                ),
+                "echo '[checking] Done. list lora tests'",
+                " ".join(
+                    [
+                        "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB",
+                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
+                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
+                    ]
+                ),
+            ],
+        },
     }
 
 
-def run_test_plan(test_plan: str, test_target: str, tests_map: dict[str, Any]):
+def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0):
+    """
+    a method to check if the test plan is parallelism or not.
+    """
+    parallelism = int(tests.get("parallelism", "0"))
+    is_parallel = parallelism and parallelism > 1
+
+    if not is_parallel:
+        return False
+
+    if shard_id > num_shards:
+        raise RuntimeError(
+            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
+        )
+
+    if num_shards != parallelism:
+        raise RuntimeError(
+            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
+        )
+
+    return True
+
+
+def run_test_plan(
+    test_plan: str,
+    test_target: str,
+    tests_map: dict[str, Any],
+    shard_id: int = 0,
+    num_shards: int = 0,
+):
     """
     a method to run list of tests based on the test plan.
     """
@@ -83,17 +182,31 @@ def run_test_plan(test_plan: str, test_target: str, tests_map: dict[str, Any]):
             f"test {test_plan} not found, please add it to test plan pool"
         )
     tests = tests_map[test_plan]
-    logger.info("Running tests: %s", tests["title"])
     pkgs = tests.get("package_install", [])
+    title = tests.get("title", "unknown test")
+
+    is_parallel = check_parallelism(tests, title, shard_id, num_shards)
+    if is_parallel:
+        title = title.replace("%N", f"{shard_id}/{num_shards}")
+
+    logger.info("Running tests: %s", title)
     if pkgs:
         logger.info("Installing packages: %s", pkgs)
         pip_install_packages(packages=pkgs, prefer_uv=True)
-    with working_directory(tests.get("working_directory", "tests")):
+    with (
+        working_directory(tests.get("working_directory", "tests")),
+        temp_environ(tests.get("env_vars", {})),
+    ):
         failures = []
         for step in tests["steps"]:
+            logger.info("Running step: %s", step)
+            if is_parallel:
+                step = replace_buildkite_placeholders(step, shard_id, num_shards)
+                logger.info("Running parallel step: %s", step)
             code = run_command(cmd=step, check=False, use_shell=True)
             if code != 0:
                 failures.append(step)
+            logger.info("Finish running step: %s", step)
         if failures:
             logger.error("Failed tests: %s", failures)
             raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}")
@@ -107,3 +220,13 @@ def clone_vllm(dst: str = "vllm"):
         dst=dst,
         update_submodules=True,
     )
+
+
+def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
+    mapping = {
+        "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards),
+        "$$BUILDKITE_PARALLEL_JOB": str(shard_id),
+    }
+    for k in sorted(mapping, key=len, reverse=True):
+        step = step.replace(k, mapping[k])
+    return step
diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
index e4a3a932bc57..2be8e246486e 100644
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@@ -61,6 +61,9 @@ class VllmTestRunner(BaseRunner):
         self.test_plan = ""
         self.test_type = TestInpuType.UNKNOWN
 
+        self.shard_id = args.shard_id
+        self.num_shards = args.num_shards
+
         if args.test_plan:
             self.test_plan = args.test_plan
             self.test_type = TestInpuType.TEST_PLAN
@@ -103,7 +106,16 @@ class VllmTestRunner(BaseRunner):
         self.prepare()
         with working_directory(self.work_directory):
             if self.test_type == TestInpuType.TEST_PLAN:
-                run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
+                if self.num_shards > 1:
+                    run_test_plan(
+                        self.test_plan,
+                        "vllm",
+                        sample_vllm_test_library(),
+                        self.shard_id,
+                        self.num_shards,
+                    )
+                else:
+                    run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
             else:
                 raise ValueError(f"Unknown test type {self.test_type}")
 
diff --git a/.ci/lumen_cli/cli/test_cli/register_test.py b/.ci/lumen_cli/cli/test_cli/register_test.py
index 20132b6d5554..2973341b83ed 100644
--- a/.ci/lumen_cli/cli/test_cli/register_test.py
+++ b/.ci/lumen_cli/cli/test_cli/register_test.py
@@ -22,6 +22,18 @@ def common_args(parser: argparse.ArgumentParser) -> None:
     """
     Add common CLI arguments to the given parser.
     """
+    parser.add_argument(
+        "--shard-id",
+        type=int,
+        default=1,
+        help="a shard id to run, e.g. '0,1,2,3'",
+    )
+    parser.add_argument(
+        "--num-shards",
+        type=int,
+        default=1,
+        help="a number of shards to run, e.g. '4'",
+    )
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument(
         "-tp",
@@ -29,7 +41,6 @@ def common_args(parser: argparse.ArgumentParser) -> None:
         type=str,
         help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
     )
-    # TODO(elainewy):add another common option that user can trigger a specific test with test config
 
 
 def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
diff --git a/.ci/lumen_cli/tests/test_run_plan.py b/.ci/lumen_cli/tests/test_run_plan.py
index 2d07827a1f69..a85ed2e3986f 100644
--- a/.ci/lumen_cli/tests/test_run_plan.py
+++ b/.ci/lumen_cli/tests/test_run_plan.py
@@ -45,6 +45,10 @@ def patch_module(monkeypatch):
         workdir_calls.append(path)
         return nullcontext()
 
+    def fake_temp_env(map: dict[str, str]):
+        temp_calls.append(map)
+        return nullcontext()
+
     logger = SimpleNamespace(
         info=MagicMock(name="logger.info"),
         error=MagicMock(name="logger.error"),
@@ -58,6 +62,7 @@ def patch_module(monkeypatch):
     monkeypatch.setattr(
         module, "working_directory", fake_working_directory, raising=True
    )
+    monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True)
     monkeypatch.setattr(module, "logger", logger, raising=True)
 
     return SimpleNamespace(
@@ -79,8 +84,8 @@ def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_modu
         "title": "Basic suite",
         "package_install": [],
         "working_directory": "tests",
+        "env_vars": {"GLOBAL_FLAG": "1"},
         "steps": [
-            "export GLOBAL_FLAG=1",
             "export A=x && pytest -q",
             "export B=y && pytest -q tests/unit",
         ],
@@ -97,14 +102,13 @@ def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_modu
     checks = [_get_check(c) for c in calls]
 
     assert cmds == [
-        "export GLOBAL_FLAG=1",
         "export A=x && pytest -q",
         "export B=y && pytest -q tests/unit",
     ]
     assert all(chk is False for chk in checks)
 
-    # No temp_env assertions anymore
     assert patch_module.workdir_calls == ["tests"]
+    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]
 
 
 def test_installs_packages_when_present(monkeypatch, patch_module):
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index d27516ec9266..5a82ec2fa85e 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -1636,7 +1636,7 @@ elif [[ "$TEST_CONFIG" == *vllm* ]]; then
   fi
   echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
   (cd .ci/lumen_cli && python -m pip install -e .)
-  python -m cli.run test external vllm --test-plan "$TEST_CONFIG"
+  python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
 elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
   test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
index 54eb415d8b67..330a78424fee 100644
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@@ -177,7 +177,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
 RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
 RUN cat torch_build_versions.txt
-
 RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
 
 #################### BASE BUILD IMAGE ####################
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
index f58dacda84cd..14524069ab5a 100644
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@@ -5,6 +5,9 @@ on:
     tags:
       - ciflow/vllm/*
   workflow_dispatch:
+  schedule:
+    # Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
+    - cron: '0 0,12 * * *'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -41,8 +44,16 @@ jobs:
         { include: [
           { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
           { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
           { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"},
         ]}
 
     secrets: inherit
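
Reviewer note (illustration only, not part of the patch): the sharding flow above rests on two helpers added to lib.py. check_parallelism() validates the --shard-id/--num-shards arguments against a test entry's "parallelism" field, and replace_buildkite_placeholders() expands the $$BUILDKITE_PARALLEL_JOB / $$BUILDKITE_PARALLEL_JOB_COUNT placeholders inside each step before it runs. The self-contained sketch below restates those two helpers so the expansion for a sharded plan such as vllm_lora_test can be checked locally; the sample entry and the shard values (2 of 4) are made up for the example.

# Standalone sketch mirroring the helpers introduced in lib.py above.
# Not part of the change; the sample entry and shard values are illustrative.
from typing import Any


def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0) -> bool:
    # A plan is parallel when it declares "parallelism" greater than 1.
    parallelism = int(tests.get("parallelism", "0"))
    if not (parallelism and parallelism > 1):
        return False
    if shard_id > num_shards:
        raise RuntimeError(
            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
        )
    if num_shards != parallelism:
        raise RuntimeError(
            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
        )
    return True


def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
    # Substitute the Buildkite-style placeholders with the CLI shard arguments.
    mapping = {
        "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards),
        "$$BUILDKITE_PARALLEL_JOB": str(shard_id),
    }
    # Longest key first so JOB_COUNT is not clobbered by the shorter JOB key.
    for k in sorted(mapping, key=len, reverse=True):
        step = step.replace(k, mapping[k])
    return step


if __name__ == "__main__":
    lora_entry = {"title": "LoRA Test %N", "parallelism": 4}
    if check_parallelism(lora_entry, lora_entry["title"], shard_id=2, num_shards=4):
        step = (
            "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB "
            "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT"
        )
        print(replace_buildkite_placeholders(step, 2, 4))
        # -> pytest -v -s lora --shard-id=2 --num-shards=4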