[CI][V0 Deprecation] Removed V0 Only Chunked Prefill and Prefix Caching Tests (#22871)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Robert Shaw
2025-08-18 20:39:01 -04:00
committed by GitHub
parent 95e3095136
commit 6603288736
6 changed files with 0 additions and 595 deletions


@@ -88,15 +88,6 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amdexperimental]
fast_check: true
@@ -295,15 +286,6 @@ steps:
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:

.github/CODEOWNERS

@@ -31,7 +31,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao


@@ -1,296 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the outputs of HF and vLLM when using greedy sampling.
It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct",
]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this will fail in CI because we currently cannot reset the
# distributed env properly. Use a value > 1 only when testing locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", [
pytest.param("FLASHINFER",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="FLASHINFER isn't supported on ROCm")),
"FLASH_ATTN"
])
def test_models(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Checks that greedy decoding matches exactly between the HuggingFace model
and the vLLM runner when chunked prefill is enabled.
"""
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", [
pytest.param("FLASHINFER",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="FLASHINFER isn't supported on ROCm")),
"FLASH_ATTN"
])
def test_models_distributed(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: Order matters: run vLLM first, then HF. vLLM needs a fresh process
# in which CUDA has not yet been initialized. If HF runs first, CUDA gets
# initialized before the fork, which breaks the multiprocessing backend
# with the fork start method (the default).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(
"kv_cache_dtype,model",
[("fp8_e4m3",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only check the logprobs of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this will fail in CI because we currently cannot reset the
# distributed env properly. Use a value > 1 only when testing locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
@pytest.mark.skipif(current_platform.is_rocm(),
reason="machete_prepack_B isn't supported on ROCm")
def test_models_with_fp8_kv_cache(
vllm_runner: VllmRunner,
example_prompts,
kv_cache_dtype: str,
model: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
disable_async_output_proc: bool,
) -> None:
"""
Check output logprobs match between no_chunked_prefill and chunked_prefill
with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
so here we only check chunked prefill.
"""
NUM_LOG_PROBS = 8
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
model,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs,
name_0="no_chunked_prefill",
name_1="chunked_prefill",
)
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
# NOTE: Increasing this will fail in CI because we currently cannot reset the
# distributed env properly. Use a value > 1 only when testing locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
tensor_parallel_size: int,
dtype: str,
) -> None:
"""
Checks that greedy decoding matches exactly with and without prefix
caching when chunked prefill is enabled.
"""
model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
"Question", # Warmup
"Question", # Fully cached
"Another question", # Partial cached
]
full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]
max_num_batched_tokens = max_num_seqs = chunk_size
outputs = {} # type: ignore
for enable in (True, False):
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy(
[prompt],
max_tokens,
)
check_outputs_equal(
outputs_0_lst=outputs[False],
outputs_1_lst=outputs[True],
name_0="w/o prefix caching",
name_1="with prefix caching",
)
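
For reference, a minimal standalone sketch of the chunked-prefill setup these deleted tests exercised, using the public vLLM LLM API and the same parameters the tests passed to vllm_runner (the model name, chunk budget, and prompt here are illustrative only):

from vllm import LLM, SamplingParams

# Illustrative values; the deleted tests swept several chunk sizes (1, 4, 16).
llm = LLM(
    model="facebook/opt-125m",
    enable_chunked_prefill=True,   # split long prefills into chunks
    max_num_batched_tokens=16,     # per-step token budget, i.e. the chunk size
    max_num_seqs=16,
    enforce_eager=True,
)
params = SamplingParams(temperature=0.0, max_tokens=32)  # greedy decoding
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)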


@@ -1,49 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_LEN_LEN = [
# Example models with sliding window.
("bigcode/starcoder2-3b", 4096, 16384),
# ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI
# Confirm that a model with sliding window disabled in its config works.
# config has "use_sliding_window": false
("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
# config has no sliding window attribute.
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
]
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len):
model, sliding_len, full_len = model_len_len
disabled_llm = LLM(model, disable_sliding_window=True)
disabled_llm.generate("Hi my name is")
model_config = disabled_llm.llm_engine.model_config
assert model_config.max_model_len == sliding_len, (
f"Max len expected to equal sliding_len of {sliding_len}, "
f"but got {model_config.max_model_len}")
del disabled_llm
cleanup_dist_env_and_memory()
enabled_llm = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
enabled_llm.generate("Hi my name is")
model_config = enabled_llm.llm_engine.model_config
assert model_config.max_model_len == full_len, (
f"Max len expected to equal full_len of {full_len}, "
f"but got {model_config.max_model_len}")
del enabled_llm
cleanup_dist_env_and_memory()
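
The deleted test above boils down to checking how disable_sliding_window affects the resolved max_model_len. A minimal sketch of that check with the public LLM API, reusing the model and window length from the deleted MODEL_LEN_LEN table (treat the exact numbers as belonging to that test, not as a general guarantee):

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory

# With the sliding window disabled, the usable context is capped at the window size.
llm = LLM("bigcode/starcoder2-3b", disable_sliding_window=True)
assert llm.llm_engine.model_config.max_model_len == 4096
del llm
cleanup_dist_env_and_memory()  # free resources before building another engine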


@@ -1,231 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from __future__ import annotations
import pytest
from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt
from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
MODELS = [
"distilbert/distilgpt2",
]
UNSTABLE_PROMPT_SEQUENCE = [
([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
([0] * 588) + ([8] * 1539),
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.parametrize("block_size", [16])
def test_mixed_requests(
hf_runner,
vllm_runner,
example_prompts,
model: str,
backend: str,
dtype: str,
max_tokens: int,
cached_position: int,
enable_chunked_prefill: bool,
block_size: int,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Test the case where some sequences hit the prefix cache and others do
not. The cached position determines where the cached sequence sits
within the batch of prefills.
"""
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
cached_prompt = example_prompts[cached_position]
with vllm_runner(
model,
dtype=dtype,
enable_prefix_caching=True,
enable_chunked_prefill=enable_chunked_prefill,
block_size=block_size,
) as vllm_model:
# Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts
greedy_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens)
req_outputs = vllm_model.llm.generate(example_prompts,
greedy_params)
# Verify number of cached tokens
for i in range(len(req_outputs)):
if i == cached_position:
expected_num_cached_tokens = (
len(req_outputs[i].prompt_token_ids) //
block_size) * block_size
else:
expected_num_cached_tokens = 0
assert (req_outputs[i].num_cached_tokens ==
expected_num_cached_tokens)
vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids),
output.prompt + output.outputs[0].text,
) for output in req_outputs]
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence(
vllm_runner,
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct",
enable_chunked_prefill=True,
enable_prefix_caching=True,
max_model_len=4096,
) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1))
@pytest.mark.parametrize("model", MODELS)
def test_fully_cached_prefill_needs_uncached_token(model):
block_size = 16
max_num_batched_tokens = 16
num_output_tokens = 5
# Make a vllm engine
runner = VllmRunner(
model_name=model,
gpu_memory_utilization=0.7,
enable_chunked_prefill=True,
enforce_eager=True,
enable_prefix_caching=True,
block_size=block_size,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_batched_tokens,
)
engine: LLMEngine = runner.llm.llm_engine
scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore
engine.scheduler[0] = scheduler
# SeqA
seqA_tokens = list(range(2 * block_size))
seqA, seq_groupA = create_dummy_prompt(
request_id="0",
prompt_tokens=seqA_tokens,
max_tokens=num_output_tokens,
block_size=block_size,
)
scheduler.add_seq_group(seq_groupA)
assert seqA.data.get_num_computed_tokens() == 0
# Prefill seqA
while not seqA.is_finished():
engine.step()
# seqB
seqB_tokens = [t + 1 for t in seqA_tokens] # shift by 1
seqB, seq_groupB = create_dummy_prompt(
request_id="1",
prompt_tokens=seqB_tokens,
max_tokens=num_output_tokens,
block_size=block_size,
)
# seqC is the same as seqA
seqC, seq_groupC = create_dummy_prompt(
request_id="2",
prompt_tokens=seqA_tokens,
max_tokens=num_output_tokens,
block_size=block_size,
)
scheduler.add_seq_group(seq_groupB)
scheduler.add_seq_group(seq_groupC)
# Even though seqC is fully cached, it should not be prefilled since we
# require at least 1 uncached token.
engine.step()
sched_metas, sched_out, _ = scheduler.last_schedule_ret()
assert len(sched_out.scheduled_seq_groups) == 1
assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
seq_groupB.request_id)
assert (sched_out.scheduled_seq_groups[0].token_chunk_size ==
max_num_batched_tokens)
# Once seqB is finished, seqC can be prefilled.
while not seqB.is_finished():
engine.step()
sched_metas, sched_out, _ = scheduler.last_schedule_ret()
assert len(sched_out.scheduled_seq_groups) == 1
assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
seq_groupB.request_id)
engine.step()
sched_metas, sched_out, _ = scheduler.last_schedule_ret()
assert len(sched_out.scheduled_seq_groups) == 1
assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
seq_groupC.request_id)
assert sched_out.scheduled_seq_groups[0].token_chunk_size == len(
seqA_tokens)
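
Two behaviors the deleted prefix-caching tests relied on are worth keeping in mind: num_cached_tokens is reported in whole blocks (the prompt length rounded down to a multiple of block_size), and a fully cached prompt is still scheduled with at least one uncached token. A minimal sketch of observing the cached-token count with the public LLM API, mirroring the deleted test_mixed_requests expectation (the model, prompt, and block size are taken from the deleted tests; the sketch assumes a second identical request is enough to hit the cache):

from vllm import LLM, SamplingParams

llm = LLM(
    "distilbert/distilgpt2",
    enable_prefix_caching=True,
    enable_chunked_prefill=True,
    block_size=16,
)
params = SamplingParams(temperature=0.0, max_tokens=5)
prompt = "You are a helpful AI assistant " * 20
llm.generate([prompt], params)           # first run populates the prefix cache
out = llm.generate([prompt], params)[0]  # second run should hit the cache
# Cached tokens are counted per full block of block_size tokens.
assert out.num_cached_tokens == (len(out.prompt_token_ids) // 16) * 16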