[CI/Build] Clean up LoRA test (#23890)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
         --ignore=entrypoints/llm/test_chat.py \
         --ignore=entrypoints/llm/test_accuracy.py \
         --ignore=entrypoints/llm/test_init.py \
-        --ignore=entrypoints/llm/test_generate_multiple_loras.py \
         --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

@@ -109,10 +109,9 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Test (API Server) # 40min
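
Context for the "# it needs a clean process" lines above: those tests mutate process-global state, so the pipeline gives each file its own pytest invocation. A minimal local equivalent of that isolation, assuming only the standard library (hypothetical helper, not part of the CI scripts):

# Illustration only: run one test file in a fresh interpreter, the same
# isolation the pipeline gets from separate pytest invocations.
import subprocess
import sys

def run_isolated(test_path: str) -> int:
    return subprocess.run(
        [sys.executable, "-m", "pytest", "-v", "-s", test_path]).returncode

# e.g. run_isolated("entrypoints/llm/test_lazy_outlines.py")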

@@ -326,7 +325,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
   parallelism: 4

 - label: PyTorch Compilation Unit Tests
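
In the command above, $$BUILDKITE_PARALLEL_JOB and $$BUILDKITE_PARALLEL_JOB_COUNT ($$ escapes a literal $ in the pipeline YAML) feed the --shard-id/--num-shards test-sharding flags, splitting the collected LoRA tests across the 4 parallel jobs. A sketch of what such deterministic sharding does, not the plugin's actual implementation:

# Illustration only: each shard keeps a deterministic slice of the same
# collected test list, so the shards partition the suite.
def select_shard(items: list[str], shard_id: int, num_shards: int) -> list[str]:
    return [it for i, it in enumerate(items) if i % num_shards == shard_id]

tests = [f"lora/test_{n}.py" for n in "abcde"]
covered = sorted(t for s in range(4) for t in select_shard(tests, s, 4))
assert covered == sorted(tests)  # every test runs exactly once across shards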

@@ -807,13 +806,13 @@ steps:
   # requires multi-GPU testing for validation.
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_multi_loras_with_tp.py
+  - pytest -v -s -x lora/test_llm_with_multi_loras.py

 - label: Weight Loading Multiple GPU Test # 33min
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   optional: true
   source_file_dependencies:
   - vllm/

tests/entrypoints/llm/test_generate_multiple_loras.py (deleted)
@@ -1,80 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import weakref
-
-import pytest
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-
-from vllm import LLM
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.lora.request import LoRARequest
-
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-
-PROMPTS = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module", params=[False, True])
-def llm(request, monkeypatch_module):
-
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
-
-    # pytest caches the fixture so we use weakref.proxy to
-    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              tensor_parallel_size=1,
-              max_model_len=8192,
-              enable_lora=True,
-              max_loras=4,
-              max_lora_rank=64,
-              max_num_seqs=128,
-              enforce_eager=True)
-
-    yield weakref.proxy(llm)
-
-    del llm
-
-    cleanup_dist_env_and_memory()
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.mark.skip_global_cleanup
-def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
-    lora_request = [
-        LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
-        for idx in range(len(PROMPTS))
-    ]
-    # Multiple SamplingParams should be matched with each prompt
-    outputs = llm.generate(PROMPTS, lora_request=lora_request)
-    assert len(PROMPTS) == len(outputs)
-
-    # Exception raised, if the size of params does not match the size of prompts
-    with pytest.raises(ValueError):
-        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
-
-    # Single LoRARequest should be applied to every prompt
-    single_lora_request = lora_request[0]
-    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
-    assert len(PROMPTS) == len(outputs)
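
The deleted fixture's comment ("pytest caches the fixture so we use weakref.proxy to enable garbage collection") is the one subtle trick in this file: pytest caches a module-scoped fixture's yielded value, so yielding a weak proxy instead of the LLM itself lets `del llm` during teardown actually release the engine. A self-contained sketch of that behavior, with an illustrative stand-in class:

# Illustration only: why the fixture yields weakref.proxy(llm).
import gc
import weakref

class Engine:  # stand-in for the LLM object the fixture creates
    pass

engine = Engine()
proxy = weakref.proxy(engine)  # the value pytest would cache

del engine   # the fixture's `del llm`: drops the only strong reference
gc.collect()
try:
    proxy.__class__
except ReferenceError:
    print("engine was collected despite the cached fixture value")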

tests/lora/test_llm_with_multi_loras.py (renamed from tests/lora/test_multi_loras_with_tp.py)
@@ -1,8 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Script to test multi loras service with tp >= 2
+This script contains:
+1. test multi loras service with tp >= 2
+2. test multi loras request
 """
+import pytest

 from tests.utils import multi_gpu_test
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
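
The multi_gpu_test import gates the tensor-parallel tests in this file; judging from tests/utils it is a decorator factory that skips a test when fewer GPUs are available, used along these lines (illustrative; the num_gpus signature is assumed):

from tests.utils import multi_gpu_test  # as imported in the hunk above

@multi_gpu_test(num_gpus=2)  # assumed: skip unless at least 2 GPUs are present
def test_multi_loras_with_tp():
    ...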
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():

     output_text = call_llm_get_outputs(prompt, "Alice")
     check_outputs(output_text, expected_output)
+
+
+def test_multiple_lora_requests():
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    PROMPTS = ["Hello, my name is"] * 2
+    LORA_NAME = "Alice"
+    lora_request = [
+        LoRARequest(LORA_NAME + str(idx), idx + 1,
+                    LORA_NAME_PATH_MAP[LORA_NAME])
+        for idx in range(len(PROMPTS))
+    ]
+    # A list of LoRA requests should be matched one-to-one with the prompts
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # An exception is raised if the number of LoRA requests does not match the number of prompts
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # A single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)
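
The new test pins down the lora_request contract: a list must match the prompts one-to-one (a ValueError otherwise), while a single request is broadcast to every prompt. A hypothetical helper showing that check, not vLLM's actual implementation:

# Illustration only: the shape check test_multiple_lora_requests exercises.
from typing import Sequence, Union

def normalize_lora_requests(
        prompts: Sequence[str],
        lora_request: Union[object, Sequence[object], None]) -> list:
    if lora_request is None or not isinstance(lora_request, (list, tuple)):
        return [lora_request] * len(prompts)  # broadcast single request / None
    if len(lora_request) != len(prompts):
        raise ValueError(
            "lora_request list length must match the number of prompts")
    return list(lora_request)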