[CI] Replace large models with tiny alternatives in tests (#24057)

Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by Tahsin Tunan on 2025-10-16 20:51:27 +06:00, committed by GitHub
parent 02d709a6f1
commit 43721bc67f
17 changed files with 118 additions and 59 deletions
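
The hmellor/tiny-random-* checkpoints used throughout this diff are randomly initialized models with drastically shrunken configs, so they load in seconds and fit easily in CI memory. A minimal sketch of how such a checkpoint can be produced with transformers (the exact dimensions of the published checkpoints may differ, and hf-internal-testing/llama-tokenizer is just one compatible tokenizer choice):

from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

# Shrink every dimension that drives memory/compute; the weights stay random.
config = LlamaConfig(
    hidden_size=16,
    intermediate_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    num_key_value_heads=2,
    vocab_size=32000,
)
model = LlamaForCausalLM(config)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

# Save (or push) the tiny checkpoint so tests can load it by name.
model.save_pretrained("tiny-random-LlamaForCausalLM")
tokenizer.save_pretrained("tiny-random-LlamaForCausalLM")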

View File

@@ -20,7 +20,7 @@ from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 MODELS = [
-    "google/gemma-2-2b-it",
+    "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
 ]
@@ -29,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2")
+    llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
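
The GC check above follows the standard weakref pattern: hold only a weak reference, drop the strong one, collect, and assert the weak reference is dead. A self-contained illustration of that pattern (not part of the diff; the Engine class is just a stand-in object):

import gc
import weakref

class Engine:
    pass

obj = Engine()
ref = weakref.ref(obj)
del obj
gc.collect()  # needed when reference cycles are involved
assert ref() is None  # fails if a live circular reference keeps the object alive
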
@@ -125,14 +125,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
     [
-        ("distilbert/distilgpt2", "ray", "", "L4", {}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "ray", "", "L4", {}),
+        ("facebook/opt-125m", "mp", "", "L4", {}),
+        ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "A100", {}),
-        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("facebook/opt-125m", "ray", "", "A100", {}),
+        ("facebook/opt-125m", "mp", "", "A100", {}),
     ],
 )
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])

View File

@@ -6,5 +6,5 @@ from ..utils import compare_two_settings
 def test_cpu_offload():
     compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+        "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
     )

View File

@@ -120,7 +120,7 @@ def test_cumem_with_cudagraph():
     "model",
     [
         # sleep mode with safetensors
-        "meta-llama/Llama-3.2-1B",
+        "hmellor/tiny-random-LlamaForCausalLM",
         # sleep mode with pytorch checkpoint
         "facebook/opt-125m",
     ],
@@ -174,7 +174,7 @@ def test_end_to_end(model: str):
 @create_new_process_for_each_test()
 def test_deep_sleep():
-    model = "Qwen/Qwen3-0.6B"
+    model = "hmellor/tiny-random-LlamaForCausalLM"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
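
The sleep-mode tests above account for reclaimed GPU memory by snapshotting torch.cuda.mem_get_info() before and after an operation and comparing the used-byte deltas. A minimal standalone illustration of that accounting (assumes a CUDA device; the 256 MiB allocation is an arbitrary stand-in for loading or sleeping a model):

import torch

free_before, total = torch.cuda.mem_get_info()  # (free, total) in bytes
used_before = total - free_before

x = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda")  # ~256 MiB

free_after, _ = torch.cuda.mem_get_info()
used_after = total - free_after
print(f"GPU memory delta: {(used_after - used_before) / 2**20:.0f} MiB")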

View File

@@ -273,14 +273,14 @@ def _compare_sp(
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
-    "meta-llama/Llama-3.2-1B-Instruct",
+    "hmellor/tiny-random-LlamaForCausalLM",
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]

View File

@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+import torch
 from vllm import LLM
@@ -12,6 +13,8 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
@@ -24,7 +27,7 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     llm = LLM(
-        model="meta-llama/Llama-3.2-1B-Instruct",
+        model="hmellor/tiny-random-LlamaForCausalLM",
         enforce_eager=True,
         load_format="dummy",
         tensor_parallel_size=tp_size,

View File

@@ -9,7 +9,7 @@ import pytest
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 # ruff: noqa: E501
 INPUT_BATCH = (

View File

@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
 LORA_UNLOADING_SUCCESS_MESSAGE = (

View File

@@ -1,37 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import signal
+import subprocess
+import sys
+import time
 import openai
 import pytest
-from ...utils import RemoteOpenAIServer
+from ...utils import get_open_port
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
-    # dtype, max-len etc set so that this can run in CI
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        async with remote_server.get_async_client() as client:
-            with pytest.raises((openai.APIConnectionError, openai.InternalServerError)):
-                # Asking for lots of prompt logprobs will currently crash the
-                # engine. This may change in the future when that bug is fixed
-                prompt = "Hello " * 4000
-                await client.completions.create(
-                    model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10}
-            # Now the server should shut down
-            return_code = remote_server.proc.wait(timeout=8)
-            assert return_code is not None
+    """Verify that API returns connection error when server process is killed.
+    Starts a vLLM server, kills it to simulate a crash, then verifies that
+    subsequent API calls fail appropriately.
+    """
+    port = get_open_port()
+    proc = subprocess.Popen(
+        [
+            # dtype, max-len etc set so that this can run in CI
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "128",
+            "--enforce-eager",
+            "--port",
+            str(port),
+            "--gpu-memory-utilization",
+            "0.05",
+            "--max-num-seqs",
+            "2",
+            "--disable-frontend-multiprocessing",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
+    )
+    # Wait for server startup
+    start_time = time.time()
+    client = openai.AsyncOpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="dummy",
+        max_retries=0,
+        timeout=10,
+    )
+    # Poll until server is ready
+    while time.time() - start_time < 30:
+        try:
+            await client.completions.create(
+                model=MODEL_NAME, prompt="Hello", max_tokens=1
+            )
+            break
+        except Exception:
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                stdout, stderr = proc.communicate(timeout=1)
+                pytest.fail(
+                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
+                )
+    else:
+        proc.terminate()
+        proc.wait(timeout=5)
+        pytest.fail("Server failed to start in 30 seconds")
+    # Kill server to simulate crash
+    proc.terminate()
+    time.sleep(1)
+    # Verify API calls now fail
+    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
+        await client.completions.create(
+            model=MODEL_NAME, prompt="This should fail", max_tokens=1
+        )
+    return_code = proc.wait(timeout=5)
+    assert return_code is not None

View File

@@ -330,6 +330,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
             "guard": "meta-llama/Llama-Guard-3-1B",
             "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
             "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            "tiny": "hmellor/tiny-random-LlamaForCausalLM",
         },
     ),
     "LLaMAForCausalLM": _HfExamplesInfo(

View File

@@ -35,15 +35,13 @@ def _generate(
 class TestOneTokenBadWord:
-    MODEL = "TheBloke/Llama-2-7B-fp16"
+    MODEL = "hmellor/tiny-random-LlamaForCausalLM"
-    PROMPT = "Hi! How are"
-    TARGET_TOKEN = "you"
+    PROMPT = "How old are "
+    TARGET_TOKEN = "mn"
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.MODEL, add_prefix_space=True
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
         self.target_token_id = self._encode(

View File

@@ -5,7 +5,7 @@ import pytest
 from vllm import LLM
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"

View File

@@ -24,9 +24,11 @@ from ...utils import create_new_process_for_each_test, multi_gpu_test
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
-PROMPT = "Hello my name is Robert and I love quantization kernels"
+# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
+# Adjust prompt if changing model to maintain 12-token length.
+PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
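
The new comment pins the prompt to exactly 12 tokens for test_engine_core_concurrent_batches. A quick way to check that invariant when swapping models or prompts (the expected count of 12 comes from the comment above, not re-derived here):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hmellor/tiny-random-LlamaForCausalLM")
prompt_tokens = tokenizer("I am Gyoubu Masataka Oniwa").input_ids
print(len(prompt_tokens))  # the test expects 12 tokens per prompt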

View File

@@ -10,7 +10,7 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from tests.v1.utils import check_request_balancing
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 DP_SIZE = os.getenv("DP_SIZE", "1")

View File

@@ -5,16 +5,13 @@ import pytest
 from vllm import LLM, SamplingParams
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    # Disable prefix caching so that we can test prompt logprobs.
-    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
-    # is merged
-    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
+    return LLM(MODEL, enforce_eager=True)
 def test_n_gt_1(llm):

View File

@@ -15,7 +15,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 @pytest.mark.asyncio

View File

@@ -18,7 +18,7 @@ from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 def evil_forward(self, *args, **kwargs):

View File

@@ -16,7 +16,7 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 def evil_method(self, *args, **kwargs):
@@ -76,8 +76,10 @@ def test_llm_startup_error(
     Test profiling (forward()) and load weights failures.
     TODO(andy) - LLM without multiprocessing.
     """
-    if model != "meta-llama/Llama-3.2-1B":
-        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
+    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
+    # If MODELS list grows, each architecture needs its own test variant.
+    if model != "JackFram/llama-68m":
+        pytest.skip(reason="Only test JackFram/llama-68m")
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
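
The skip above exists because these failure-injection tests monkeypatch LlamaForCausalLM directly, so only a Llama-architecture checkpoint exercises the patched path. A sketch of that pattern, assuming the engine runs in-process so the patch is visible to it (evil_forward, the env var handling, and the broad exception check here are illustrative, not the exact test code):

import pytest
from vllm import LLM
from vllm.model_executor.models.llama import LlamaForCausalLM


def evil_forward(self, *args, **kwargs):
    # Simulate a model failure during profiling / the first forward pass.
    raise RuntimeError("simulated forward failure")


def test_startup_error_sketch(monkeypatch: pytest.MonkeyPatch):
    # Keep the engine in the test process so the patched class is the one used
    # (assumption: with a separate engine process the patch would not apply).
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward)
    with pytest.raises(Exception):
        LLM("hmellor/tiny-random-LlamaForCausalLM", enforce_eager=True)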