[CI] Replace large models with tiny alternatives in tests (#24057)

Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by Tahsin Tunan on 2025-10-16 20:51:27 +06:00, committed by GitHub
parent 02d709a6f1
commit 43721bc67f
17 changed files with 118 additions and 59 deletions
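
The hmellor/tiny-random-* checkpoints used throughout this diff are randomly initialized models with drastically shrunken configs, so they load in seconds and fit easily in CI memory. A minimal sketch of how such a checkpoint can be produced with transformers (the exact dimensions of the published checkpoints may differ, and hf-internal-testing/llama-tokenizer is just one compatible tokenizer choice):

from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

# Shrink every dimension that drives memory/compute; the weights stay random.
config = LlamaConfig(
    hidden_size=16,
    intermediate_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    num_key_value_heads=2,
    vocab_size=32000,
)
model = LlamaForCausalLM(config)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

# Save (or push) the tiny checkpoint so tests can load it by name.
model.save_pretrained("tiny-random-LlamaForCausalLM")
tokenizer.save_pretrained("tiny-random-LlamaForCausalLM")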

View File

@@ -20,7 +20,7 @@ from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 MODELS = [
-    "google/gemma-2-2b-it",
+    "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
 ]
@@ -29,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2")
+    llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
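
The GC check above follows the standard weakref pattern: hold only a weak reference, drop the strong one, collect, and assert the weak reference is dead. A self-contained illustration of that pattern (not part of the diff; the Engine class is just a stand-in object):

import gc
import weakref

class Engine:
    pass

obj = Engine()
ref = weakref.ref(obj)
del obj
gc.collect()  # needed when reference cycles are involved
assert ref() is None  # fails if a live circular reference keeps the object alive
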
@@ -125,14 +125,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
     [
-        ("distilbert/distilgpt2", "ray", "", "L4", {}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "ray", "", "L4", {}),
+        ("facebook/opt-125m", "mp", "", "L4", {}),
+        ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "A100", {}),
-        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("facebook/opt-125m", "ray", "", "A100", {}),
+        ("facebook/opt-125m", "mp", "", "A100", {}),
     ],
 )
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])

View File

@@ -6,5 +6,5 @@ from ..utils import compare_two_settings
 def test_cpu_offload():
     compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+        "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
     )

View File

@@ -120,7 +120,7 @@ def test_cumem_with_cudagraph():
     "model",
     [
         # sleep mode with safetensors
-        "meta-llama/Llama-3.2-1B",
+        "hmellor/tiny-random-LlamaForCausalLM",
         # sleep mode with pytorch checkpoint
         "facebook/opt-125m",
     ],
@@ -174,7 +174,7 @@ def test_end_to_end(model: str):
 @create_new_process_for_each_test()
 def test_deep_sleep():
-    model = "Qwen/Qwen3-0.6B"
+    model = "hmellor/tiny-random-LlamaForCausalLM"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
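
The sleep-mode tests above account for reclaimed GPU memory by snapshotting torch.cuda.mem_get_info() before and after an operation and comparing the used-byte deltas. A minimal standalone illustration of that accounting (assumes a CUDA device; the 256 MiB allocation is an arbitrary stand-in for loading or sleeping a model):

import torch

free_before, total = torch.cuda.mem_get_info()  # (free, total) in bytes
used_before = total - free_before

x = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda")  # ~256 MiB

free_after, _ = torch.cuda.mem_get_info()
used_after = total - free_after
print(f"GPU memory delta: {(used_after - used_before) / 2**20:.0f} MiB")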

View File

@@ -273,14 +273,14 @@ def _compare_sp(
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
-    "meta-llama/Llama-3.2-1B-Instruct",
+    "hmellor/tiny-random-LlamaForCausalLM",
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]

View File

@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+import torch
 from vllm import LLM
@@ -12,6 +13,8 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
@@ -24,7 +27,7 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     llm = LLM(
-        model="meta-llama/Llama-3.2-1B-Instruct",
+        model="hmellor/tiny-random-LlamaForCausalLM",
         enforce_eager=True,
         load_format="dummy",
         tensor_parallel_size=tp_size,

View File

@@ -9,7 +9,7 @@ import pytest
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 # ruff: noqa: E501
 INPUT_BATCH = (

View File

@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
 LORA_UNLOADING_SUCCESS_MESSAGE = (

View File

@@ -1,37 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import signal
+import subprocess
+import sys
+import time
 import openai
 import pytest
-from ...utils import RemoteOpenAIServer
+from ...utils import get_open_port
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
-    # dtype, max-len etc set so that this can run in CI
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        async with remote_server.get_async_client() as client:
-            with pytest.raises((openai.APIConnectionError, openai.InternalServerError)):
-                # Asking for lots of prompt logprobs will currently crash the
-                # engine. This may change in the future when that bug is fixed
-                prompt = "Hello " * 4000
-                await client.completions.create(
-                    model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10}
-            # Now the server should shut down
-            return_code = remote_server.proc.wait(timeout=8)
-            assert return_code is not None
+    """Verify that API returns connection error when server process is killed.
+    Starts a vLLM server, kills it to simulate a crash, then verifies that
+    subsequent API calls fail appropriately.
+    """
+    port = get_open_port()
+    proc = subprocess.Popen(
+        [
+            # dtype, max-len etc set so that this can run in CI
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "128",
+            "--enforce-eager",
+            "--port",
+            str(port),
+            "--gpu-memory-utilization",
+            "0.05",
+            "--max-num-seqs",
+            "2",
+            "--disable-frontend-multiprocessing",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
+    )
+    # Wait for server startup
+    start_time = time.time()
+    client = openai.AsyncOpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="dummy",
+        max_retries=0,
+        timeout=10,
+    )
+    # Poll until server is ready
+    while time.time() - start_time < 30:
+        try:
+            await client.completions.create(
+                model=MODEL_NAME, prompt="Hello", max_tokens=1
+            )
+            break
+        except Exception:
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                stdout, stderr = proc.communicate(timeout=1)
+                pytest.fail(
+                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
+                )
+    else:
+        proc.terminate()
+        proc.wait(timeout=5)
+        pytest.fail("Server failed to start in 30 seconds")
+    # Kill server to simulate crash
+    proc.terminate()
+    time.sleep(1)
+    # Verify API calls now fail
+    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
+        await client.completions.create(
+            model=MODEL_NAME, prompt="This should fail", max_tokens=1
+        )
+    return_code = proc.wait(timeout=5)
+    assert return_code is not None

View File

@@ -330,6 +330,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
             "guard": "meta-llama/Llama-Guard-3-1B",
             "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
             "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            "tiny": "hmellor/tiny-random-LlamaForCausalLM",
         },
     ),
     "LLaMAForCausalLM": _HfExamplesInfo(

View File

@@ -35,15 +35,13 @@ def _generate(
 class TestOneTokenBadWord:
-    MODEL = "TheBloke/Llama-2-7B-fp16"
+    MODEL = "hmellor/tiny-random-LlamaForCausalLM"
-    PROMPT = "Hi! How are"
-    TARGET_TOKEN = "you"
+    PROMPT = "How old are "
+    TARGET_TOKEN = "mn"
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.MODEL, add_prefix_space=True
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
         self.target_token_id = self._encode(

View File

@@ -5,7 +5,7 @@ import pytest
 from vllm import LLM
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"

View File

@@ -24,9 +24,11 @@ from ...utils import create_new_process_for_each_test, multi_gpu_test
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
-PROMPT = "Hello my name is Robert and I love quantization kernels"
+# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
+# Adjust prompt if changing model to maintain 12-token length.
+PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
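
The new comment pins the prompt to exactly 12 tokens for test_engine_core_concurrent_batches. A quick way to check that invariant when swapping models or prompts (the expected count of 12 comes from the comment above, not re-derived here):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hmellor/tiny-random-LlamaForCausalLM")
prompt_tokens = tokenizer("I am Gyoubu Masataka Oniwa").input_ids
print(len(prompt_tokens))  # the test expects 12 tokens per prompt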

View File

@@ -10,7 +10,7 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from tests.v1.utils import check_request_balancing
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 DP_SIZE = os.getenv("DP_SIZE", "1")

View File

@@ -5,16 +5,13 @@ import pytest
 from vllm import LLM, SamplingParams
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    # Disable prefix caching so that we can test prompt logprobs.
-    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
-    # is merged
-    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
+    return LLM(MODEL, enforce_eager=True)
 def test_n_gt_1(llm):

View File

@@ -15,7 +15,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 @pytest.mark.asyncio

View File

@@ -18,7 +18,7 @@ from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 def evil_forward(self, *args, **kwargs):

View File

@@ -16,7 +16,7 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 def evil_method(self, *args, **kwargs):
@@ -76,8 +76,10 @@ def test_llm_startup_error(
     Test profiling (forward()) and load weights failures.
     TODO(andy) - LLM without multiprocessing.
     """
-    if model != "meta-llama/Llama-3.2-1B":
-        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
+    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
+    # If MODELS list grows, each architecture needs its own test variant.
+    if model != "JackFram/llama-68m":
+        pytest.skip(reason="Only test JackFram/llama-68m")
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
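
The skip above exists because these failure-injection tests monkeypatch LlamaForCausalLM directly, so only a Llama-architecture checkpoint exercises the patched path. A sketch of that pattern, assuming the engine runs in-process so the patch is visible to it (evil_forward, the env var handling, and the broad exception check here are illustrative, not the exact test code):

import pytest
from vllm import LLM
from vllm.model_executor.models.llama import LlamaForCausalLM


def evil_forward(self, *args, **kwargs):
    # Simulate a model failure during profiling / the first forward pass.
    raise RuntimeError("simulated forward failure")


def test_startup_error_sketch(monkeypatch: pytest.MonkeyPatch):
    # Keep the engine in the test process so the patched class is the one used
    # (assumption: with a separate engine process the patch would not apply).
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward)
    with pytest.raises(Exception):
        LLM("hmellor/tiny-random-LlamaForCausalLM", enforce_eager=True)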