mirror of https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[CI][V0 Deprecation] Removed V0 Only Chunked Prefill and Prefix Caching Tests (#22871)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -88,15 +88,6 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
  mirror_hardwares: [amdexperimental]
  fast_check: true
@@ -295,15 +286,6 @@ steps:
  - python3 offline_inference/basic/score.py
  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
  - pytest -v -s prefix_caching

- label: Platform Tests (CUDA)
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
.github/CODEOWNERS (vendored): 1 line changed
@@ -31,7 +31,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
@@ -1,296 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the outputs of HF and vLLM when using greedy sampling.

It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.

Run `pytest tests/models/test_chunked_prefill.py`.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

if TYPE_CHECKING:
    from .conftest import HfRunner, VllmRunner

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-3.2-1B-Instruct",
]


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Since this module is V0 only, set VLLM_USE_V1=0 for
    all tests in the file.
    """
    with monkeypatch.context() as m:
        m.setenv('VLLM_USE_V1', '0')
        yield


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", [
    pytest.param("FLASHINFER",
                 marks=pytest.mark.skipif(
                     current_platform.is_rocm(),
                     reason="FLASHINFER isn't supported on ROCm")),
    "FLASH_ATTN"
])
def test_models(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

        max_num_seqs = chunked_prefill_token_size
        max_num_batched_tokens = chunked_prefill_token_size

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        with vllm_runner(
                model,
                dtype=dtype,
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", [
    pytest.param("FLASHINFER",
                 marks=pytest.mark.skipif(
                     current_platform.is_rocm(),
                     reason="FLASHINFER isn't supported on ROCm")),
    "FLASH_ATTN"
])
def test_models_distributed(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
        if (model == "meta-llama/Llama-3.2-1B-Instruct"
                and distributed_executor_backend == "ray"):
            # test Ray Compiled Graph
            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

        dtype = "half"
        max_tokens = 5
        chunked_prefill_token_size = 16

        # Add a chunked prefill config.
        max_num_seqs = min(chunked_prefill_token_size, 256)
        assert chunked_prefill_token_size != -1
        enable_chunked_prefill = True
        max_num_batched_tokens = chunked_prefill_token_size

        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with
        # fork method (the default method).

        with vllm_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=2,
                max_num_seqs=max_num_seqs,
                enable_chunked_prefill=enable_chunked_prefill,
                max_num_batched_tokens=max_num_batched_tokens,
                distributed_executor_backend=distributed_executor_backend,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(
                example_prompts,
                max_tokens,
            )

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",
      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="machete_prepack_B isn't supported on ROCm")
def test_models_with_fp8_kv_cache(
    vllm_runner: VllmRunner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
) -> None:
    """
    Check output logprobs match between no_chunked_prefill and chunked_prefill
    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
    so here we only check chunked prefill.
    """
    NUM_LOG_PROBS = 8

    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with vllm_runner(
            model,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    with vllm_runner(
            model,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
        outputs_1_lst=chunked_prefill_outputs,
        name_0="no_chunked_prefill",
        name_1="chunked_prefill",
    )


@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
    tensor_parallel_size: int,
    dtype: str,
) -> None:
    """
    Checks exact match decode with and without prefix caching
    with chunked prefill enabled.
    """
    model = "meta-llama/Llama-3.2-1B-Instruct"
    # The common prompt has 142 tokens with Llama-2 tokenizer.
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [
        "Question",  # Warmup
        "Question",  # Fully cached
        "Another question",  # Partial cached
    ]
    full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]

    max_num_batched_tokens = max_num_seqs = chunk_size
    outputs = {}  # type: ignore
    for enable in (True, False):
        with vllm_runner(
                model,
                dtype=dtype,
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:
                outputs[enable] += vllm_model.generate_greedy(
                    [prompt],
                    max_tokens,
                )

    check_outputs_equal(
        outputs_0_lst=outputs[False],
        outputs_1_lst=outputs[True],
        name_0="w/o prefix caching",
        name_1="with prefix caching",
    )
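The docstring of the deleted test above summarizes the mechanism it exercised: chunked prefill is enabled with enable_chunked_prefill=True, and a prefill longer than max_num_batched_tokens is split across scheduler steps. For orientation, a minimal sketch of the same knobs through the public LLM API follows; it is not part of this commit, and the model, prompt, and token budget are illustrative only.

# Minimal sketch, not part of this commit: enable chunked prefill via the LLM
# API. Parameter names mirror the deleted test; the values are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",
    enable_chunked_prefill=True,
    # Prefills longer than this token budget are split across engine steps.
    max_num_batched_tokens=64,
)
outputs = llm.generate(["Hello, my name is"] * 4,
                       SamplingParams(temperature=0.0, max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)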
@@ -1,49 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.

Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_LEN_LEN = [
    # Example models with sliding window.
    ("bigcode/starcoder2-3b", 4096, 16384),
    # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI

    # Confirm model with sliding window works.
    # config has "use_sliding_window": false
    ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
    # config has no sliding window attribute.
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
]


@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len, ):
    model, sliding_len, full_len = model_len_len
    disabled_llm = LLM(model, disable_sliding_window=True)
    disabled_llm.generate("Hi my name is")
    model_config = disabled_llm.llm_engine.model_config
    assert model_config.max_model_len == sliding_len, (
        "Max len expected to equal sliding_len of %s, but got %s", sliding_len,
        model_config.max_model_len)

    del disabled_llm
    cleanup_dist_env_and_memory()

    enabled_llm = LLM(model,
                      enforce_eager=True,
                      disable_sliding_window=False,
                      enable_prefix_caching=False)
    enabled_llm.generate("Hi my name is")
    model_config = enabled_llm.llm_engine.model_config
    assert model_config.max_model_len == full_len, (
        "Max len expected to equal full_len of %s, but got %s", full_len,
        model_config.max_model_len)

    del enabled_llm
    cleanup_dist_env_and_memory()
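The deleted sliding-window test above asserts one relationship worth keeping visible: with disable_sliding_window=True the engine clamps max_model_len to the model's sliding-window length (4096 for bigcode/starcoder2-3b in the test data), while leaving the window enabled keeps the full context length. Below is a minimal sketch of that check outside the test harness, reusing the model and attributes from the deleted test; it is illustrative only and not part of this commit.

# Minimal sketch, not part of this commit: observe how disable_sliding_window
# caps the usable context length. Model name and expected values come from the
# MODEL_LEN_LEN table in the deleted test.
from vllm import LLM

llm = LLM("bigcode/starcoder2-3b", disable_sliding_window=True)
# With the sliding window disabled, max_model_len is clamped to the window
# size (4096) instead of the full 16384-token context.
print(llm.llm_engine.model_config.max_model_len)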
@@ -1,231 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.

Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""

from __future__ import annotations

import pytest

from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt
from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR

from ..models.utils import check_outputs_equal


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    This module relies on V0 internals, so set VLLM_USE_V1=0.
    """
    with monkeypatch.context() as m:
        m.setenv('VLLM_USE_V1', '0')
        yield


MODELS = [
    "distilbert/distilgpt2",
]

UNSTABLE_PROMPT_SEQUENCE = [
    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
    ([0] * 588) + ([8] * 1539),
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.parametrize("block_size", [16])
def test_mixed_requests(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    backend: str,
    dtype: str,
    max_tokens: int,
    cached_position: int,
    enable_chunked_prefill: bool,
    block_size: int,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    Test the case when some sequences have the prefix cache hit
    and the others don't. The cached position determines where
    the sequence is at among the batch of prefills.
    """
    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, backend)

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        cached_prompt = example_prompts[cached_position]
        with vllm_runner(
                model,
                dtype=dtype,
                enable_prefix_caching=True,
                enable_chunked_prefill=enable_chunked_prefill,
                block_size=block_size,
        ) as vllm_model:
            # Run the first prompt so the cache is populated
            vllm_outputs = vllm_model.generate_greedy([cached_prompt],
                                                      max_tokens)

            # Run all the prompts
            greedy_params = SamplingParams(temperature=0.0,
                                           max_tokens=max_tokens)
            req_outputs = vllm_model.llm.generate(example_prompts,
                                                  greedy_params)

            # Verify number of cached tokens
            for i in range(len(req_outputs)):
                if i == cached_position:
                    expected_num_cached_tokens = (
                        len(req_outputs[i].prompt_token_ids) //
                        block_size) * block_size
                else:
                    expected_num_cached_tokens = 0
                assert (req_outputs[i].num_cached_tokens ==
                        expected_num_cached_tokens)

            vllm_outputs = [(
                output.prompt_token_ids + list(output.outputs[0].token_ids),
                output.prompt + output.outputs[0].text,
            ) for output in req_outputs]

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence(
    vllm_runner,
    backend: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:

    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")
    if backend == "XFORMERS" and current_platform.is_rocm():
        pytest.skip("Xformers does not support ROCm/HIP.")
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, backend)

        with vllm_runner(
                "Qwen/Qwen2.5-0.5B-Instruct",
                enable_chunked_prefill=True,
                enable_prefix_caching=True,
                max_model_len=4096,
        ) as vllm_model:
            for prompt in UNSTABLE_PROMPT_SEQUENCE:
                vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
                                    SamplingParams(max_tokens=1))


@pytest.mark.parametrize("model", MODELS)
def test_fully_cached_prefill_needs_uncached_token(model):
    block_size = 16
    max_num_batched_tokens = 16
    num_output_tokens = 5
    # Make a vllm engine
    runner = VllmRunner(
        model_name=model,
        gpu_memory_utilization=0.7,
        enable_chunked_prefill=True,
        enforce_eager=True,
        enable_prefix_caching=True,
        block_size=block_size,
        max_num_batched_tokens=max_num_batched_tokens,
        max_num_seqs=max_num_batched_tokens,
    )
    engine: LLMEngine = runner.llm.llm_engine

    scheduler: Scheduler = SchedulerProxy(engine.scheduler[0])  # type: ignore
    engine.scheduler[0] = scheduler

    # SeqA
    seqA_tokens = list(range(2 * block_size))
    seqA, seq_groupA = create_dummy_prompt(
        request_id="0",
        prompt_tokens=seqA_tokens,
        max_tokens=num_output_tokens,
        block_size=block_size,
    )

    scheduler.add_seq_group(seq_groupA)

    assert seqA.data.get_num_computed_tokens() == 0

    # Prefill seqA
    while not seqA.is_finished():
        engine.step()

    # seqB
    seqB_tokens = [t + 1 for t in seqA_tokens]  # shift by 1
    seqB, seq_groupB = create_dummy_prompt(
        request_id="1",
        prompt_tokens=seqB_tokens,
        max_tokens=num_output_tokens,
        block_size=block_size,
    )

    # seqC is the same as seqA
    seqC, seq_groupC = create_dummy_prompt(
        request_id="2",
        prompt_tokens=seqA_tokens,
        max_tokens=num_output_tokens,
        block_size=block_size,
    )

    scheduler.add_seq_group(seq_groupB)
    scheduler.add_seq_group(seq_groupC)

    # Even though seqC is fully cached, it should not be prefilled since we
    # require at least 1 uncached token.
    engine.step()

    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
    assert len(sched_out.scheduled_seq_groups) == 1
    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
            seq_groupB.request_id)
    assert (sched_out.scheduled_seq_groups[0].token_chunk_size ==
            max_num_batched_tokens)

    # When seqB is finished, seqC could be prefilled.
    while not seqB.is_finished():
        engine.step()
        sched_metas, sched_out, _ = scheduler.last_schedule_ret()
        assert len(sched_out.scheduled_seq_groups) == 1
        assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
                seq_groupB.request_id)

    engine.step()
    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
    assert len(sched_out.scheduled_seq_groups) == 1
    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
            seq_groupC.request_id)
    assert sched_out.scheduled_seq_groups[0].token_chunk_size == len(
        seqA_tokens)
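For reference, the cached-token accounting that test_mixed_requests checked is also observable through the public API: a request whose prefix was seen before reports num_cached_tokens rounded down to a whole block, i.e. (len(prompt_token_ids) // block_size) * block_size. Below is a minimal sketch under that assumption; the model, prompt, and block size are reused from the deleted tests purely for illustration and are not part of this commit.

# Minimal sketch, not part of this commit: prefix-cache hits are reported per
# request via num_cached_tokens, rounded down to a full block as asserted in
# the deleted test_mixed_requests.
from vllm import LLM, SamplingParams

block_size = 16
llm = LLM("facebook/opt-125m",
          enable_prefix_caching=True,
          block_size=block_size)
params = SamplingParams(temperature=0.0, max_tokens=5)

prompt = "You are a helpful AI assistant " * 20
llm.generate([prompt], params)           # warm up the prefix cache
out = llm.generate([prompt], params)[0]  # same prefix again

expected = (len(out.prompt_token_ids) // block_size) * block_size
print(out.num_cached_tokens, expected)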