Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[CI/Build] Clean up LoRA tests (#15867)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
@@ -11,14 +11,6 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
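
The v1 wrapper above is repeated verbatim in several of the LoRA test files touched here; as its comment notes, the same autouse fixture could instead live once in the package-level conftest.py. A minimal sketch of that promotion, assuming the shared run_with_both_engines_lora fixture is already defined in a higher-level conftest (illustrative only, not part of this commit):

# conftest.py for the LoRA test package (illustrative sketch)
import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # autouse here makes every test in the package run twice, once per engine,
    # so the per-file copies of this wrapper become unnecessary.
    pass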
@@ -1,65 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

MODEL_PATH = "google/gemma-7b"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: Painting is poetry that is seen rather than felt,",
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts


# The V1 lora test for this model requires more than 24GB.
@pytest.mark.skip_v1
@pytest.mark.xfail(current_platform.is_rocm(),
                   reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
                   max_loras=4,
                   enable_chunked_prefill=True)

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "and poetry is painting that is felt rather than seen.\n"
        "Author: Leonardo da Vinci\n",
    ]

    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
    for i in range(len(expected_lora_output)):
        assert output1[i].startswith(expected_lora_output[i])
    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
    for i in range(len(expected_lora_output)):
        assert output2[i].startswith(expected_lora_output[i])
@@ -19,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                              LinearScalingRotaryEmbeddingWithLoRA,
                              LogitsProcessorWithLoRA, LoRAMapping,
                              MergedColumnParallelLinearWithLoRA,
                              MergedQKVParallelLinearWithLoRA,
@@ -28,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                              RowParallelLinearWithLoRA,
                              VocabParallelEmbeddingWithLoRA)
# yapf: enable
from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
                              PackedLoRALayerWeights)
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
@@ -37,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
from vllm.model_executor.utils import set_random_seed
@@ -59,28 +56,16 @@ DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] if current_platform.is_cuda_alike() else ["cpu"])

#For GPU, we will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
# prefill stage(True) or decode stage(False)
STAGES = [True, False]

# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
# the tests in this file run twice, once with the V0 engine and then with
# the V1 engine.
# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
# with the inclusion of V1 tests to maintain the CI test times.
NUM_RANDOM_SEEDS = 5
# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
# 256 before. It is cut to half with the inclusion of V1 tests to maintain
# the CI test times.
NUM_RANDOM_SEEDS = 10

VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package

    def clean_cache():
        # Release any memory we might be holding on to. CI runs OOMs otherwise.
        from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                    _LORA_B_PTR_DICT)
@@ -90,6 +75,24 @@ def v1(run_with_both_engines_lora):
    yield


@pytest.fixture(autouse=True)
def skip_cuda_with_stage_false(request):
    """
    On cuda-like platforms, we use the same kernels for prefill and decode
    stage, and 'stage' is generally ignored, so we only need to test once.
    """
    if current_platform.is_cuda_alike():
        try:
            if hasattr(request.node, "callspec") and hasattr(
                    request.node.callspec, "params"):
                params = request.node.callspec.params
                if "stage" in params and params["stage"] is False:
                    pytest.skip("Skip test when stage=False")
        except Exception:
            pass
    yield


def get_random_id_to_index(num_loras: int,
                           num_slots: int,
                           log: bool = True) -> list[Optional[int]]:
@@ -1011,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                   atol=atol)


@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 8])
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
                                             (6.0, 1.0)])
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
@pytest.mark.parametrize("is_neox_style", [True, False])
@pytest.mark.parametrize("rotary_dim", [None, 32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
                    reason="Only CUDA backends are supported")
def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                       scaling_factors, max_position,
                                       is_neox_style, rotary_dim, head_size,
                                       seq_len) -> None:
    dtype = torch.float16
    max_loras = 8
    seed = 0
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             long_lora_scaling_factors=scaling_factors,
                             lora_dtype=dtype)

    if rotary_dim is None:
        rotary_dim = head_size
    base = 10000
    batch_size = 5 * num_loras
    num_heads = 7

    # Verify lora is equivalent to linear scaling rotary embedding.
    rope = get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
    )
    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
    lora_rope.set_mapping(punica_wrapper)
    lora_rope.create_lora_weights(max_loras, lora_config)
    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
                           is_neox_style, {
                               "rope_type": "linear",
                               "factor": scaling_factors
                           })
    linear_rope = linear_rope.to(dtype=dtype)
    id_to_index = get_random_id_to_index(num_loras, max_loras)
    _, index_mapping, prompt_mapping = create_random_inputs(
        active_lora_ids=[0],
        num_inputs=batch_size,
        input_size=(1, max_position),
        input_range=(0, lora_config.lora_extra_vocab_size),
        input_type=torch.float16,
        device=device)

    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
    long_lora_context = LongContextLoRAContext(list(scaling_factors),
                                               rotary_dim)

    next_expected_offset = 0
    # Make sure the offset is correct.
    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
    for scaling_factor, offset in scaling_factor_to_offset.items():
        assert offset == next_expected_offset
        next_expected_offset += scaling_factor * max_position

    for i in range(len(scaling_factors)):
        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
            scaling_factors[i], 0)
    punica_wrapper.update_metadata(
        lora_mapping,
        id_to_index,
        max_loras,
        512,
        lora_config.lora_extra_vocab_size,
        long_lora_context=long_lora_context,
    )
    # lora_rope.set_mapping(*mapping_info)

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=dtype)
    key = torch.randn_like(query)
    ref_q, ref_k = linear_rope(positions, query, key)
    actual_q, actual_k = lora_rope(positions, query, key)

    torch.allclose(ref_q, actual_q)
    torch.allclose(ref_k, actual_k)


@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize(
    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
@@ -78,6 +78,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output2[i])


@pytest.mark.skipif(current_platform.is_cuda_alike(),
                    reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
@@ -99,6 +101,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])


@pytest.mark.skipif(current_platform.is_cuda_alike(),
                    reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
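
Both MiniCPM-V hunks add the same decorator stack in front of the MiniCPM-V tests. For reference, here is how such a stacked skipif/xfail guard reads on a complete test; the test name and body are illustrative only and not part of this commit:

import pytest

from vllm.platforms import current_platform


@pytest.mark.skipif(current_platform.is_cuda_alike(),
                    reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
def test_example_guarded():  # hypothetical test, for illustration only
    # A skip mark takes precedence over xfail: when the skipif condition holds,
    # the test is skipped outright; the xfail mark only matters on platforms
    # where the test actually runs.
    assert True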
@@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

import vllm
from vllm.lora.request import LoRARequest

@@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),