Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
Dynamic scheduler delay to improve ITL performance (#3279)
Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com>
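In short: the scheduler now records how long the previous prompt scheduling step took and, when scheduler_delay_factor > 0, holds back the next prompt batch until the oldest waiting request has been queued for longer than delay_factor times that latency (or nothing is currently running). Letting the waiting queue fill up means more prompts can be batched into a single prompt run, so running sequences are interrupted less often, which is the inter-token latency (ITL) improvement the title refers to.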
@@ -1,5 +1,6 @@
 from typing import List
 import pytest  # noqa
+import time

 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -168,3 +169,36 @@ def test_scheduler_max_seqs():
     # and one is prompting.
     _, out = scheduler.schedule()
     assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]])
+
+
+def test_scheduler_delay_factor():
+
+    block_size = 4
+    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 8
+    cache_config.num_gpu_blocks = 8
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+
+    # schedule first prompt
+    _, seq_group = create_dummy_prompt("0", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+
+    # wait for a second before scheduling next prompt
+    time.sleep(1)
+    _, seq_group = create_dummy_prompt("1", prompt_length=block_size)
+    scheduler.add_seq_group(seq_group)
+
+    # second prompt should *not* be scheduled
+    seq_group_meta, out = scheduler.schedule()
+    assert not out.prompt_run
+    assert seq_group_meta[0].request_id == '0'
+
+    # wait for more than 0.5 second and try again
+    time.sleep(0.6)
+    seq_group_meta, out = scheduler.schedule()
+    assert out.prompt_run
+    assert seq_group_meta[0].request_id == '1'
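To spell out the timing in this test: the first schedule() call runs prompt '0' and marks that a prompt was scheduled. After time.sleep(1), the next schedule() call measures a last prompt latency of roughly one second, so the delay threshold is 0.5 * 1 s = 0.5 s; prompt '1' has only just arrived, so it is skipped and a decode step for '0' runs instead (prompt_run is False). After sleeping a further 0.6 s, prompt '1' has been waiting longer than the 0.5 s threshold and is scheduled.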
@@ -517,6 +517,8 @@ class SchedulerConfig:
             iteration.
         max_model_len: Maximum length of a sequence (including prompt
             and generated text).
+        delay_factor: Apply a delay (of delay factor multiplied by previous
+            prompt latency) before scheduling next prompt.
     """

     def __init__(
@@ -524,6 +526,7 @@ class SchedulerConfig:
         max_num_batched_tokens: Optional[int],
         max_num_seqs: int,
         max_model_len: int,
+        delay_factor: float = 0.0,
     ) -> None:
         if max_num_batched_tokens is not None:
             self.max_num_batched_tokens = max_num_batched_tokens
@@ -533,6 +536,7 @@ class SchedulerConfig:
             self.max_num_batched_tokens = max(max_model_len, 2048)
         self.max_num_seqs = max_num_seqs
         self.max_model_len = max_model_len
+        self.delay_factor = delay_factor
        self._verify_args()

    def _verify_args(self) -> None:
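As a concrete illustration of the formula in the new docstring (the numbers are invented for the example):

# Illustration only: with delay_factor = 0.5 and a previous prompt step that
# took 80 ms, the scheduler waits until the oldest waiting prompt has been
# queued for more than 0.5 * 0.080 s = 40 ms before scheduling another prompt
# batch (unless nothing is currently running).
delay_factor = 0.5
last_prompt_latency = 0.080
wait_threshold = delay_factor * last_prompt_latency   # 0.040 s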
@@ -103,6 +103,13 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: Deque[SequenceGroup] = deque()

+        # Time at previous scheduling step
+        self.prev_time = 0.0
+        # Did we schedule a prompt at previous step?
+        self.prev_prompt = False
+        # Latency of the last prompt step
+        self.last_prompt_latency = 0.0
+
     @property
     def lora_enabled(self) -> bool:
         return bool(self.lora_config)
@@ -179,7 +186,7 @@ class Scheduler:
             # are added to the back.
             leftover_waiting_sequences = deque()
             num_batched_tokens = 0
-            while self.waiting:
+            while self._passed_delay(now) and self.waiting:
                 seq_group = self.waiting[0]
                 waiting_seqs = seq_group.get_seqs(
                     status=SequenceStatus.WAITING)
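Here now is presumably the timestamp already captured near the top of _schedule(), so the prompt-delay check and the rest of the scheduling step share a single clock reading.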
@@ -246,6 +253,7 @@ class Scheduler:
             self.waiting.extendleft(leftover_waiting_sequences)

             if scheduled or ignored_seq_groups:
+                self.prev_prompt = True
                 scheduler_outputs = SchedulerOutputs(
                     scheduled_seq_groups=scheduled,
                     prompt_run=True,
@@ -491,3 +499,19 @@ class Scheduler:

     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         self.block_manager.mark_blocks_as_computed(seq_group)
+
+    def _passed_delay(self, now: float) -> bool:
+        if self.prev_prompt:
+            self.last_prompt_latency = now - self.prev_time
+        self.prev_time, self.prev_prompt = now, False
+        # Delay scheduling prompts to let waiting queue fill up
+        if self.scheduler_config.delay_factor > 0 and self.waiting:
+            earliest_arrival_time = min(
+                [e.metrics.arrival_time for e in self.waiting])
+            passed_delay = (
+                (now - earliest_arrival_time) >
+                (self.scheduler_config.delay_factor * self.last_prompt_latency)
+                or not self.running)
+        else:
+            passed_delay = True
+        return passed_delay
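Two details worth noting about _passed_delay: it updates prev_time and last_prompt_latency as a side effect, so last_prompt_latency is effectively the wall-clock time from the scheduling step that launched the previous prompt batch to the next prompt-scheduling attempt, an approximation of the prompt step's duration rather than an exact measurement; and the "or not self.running" clause guarantees the delay can never leave the engine idle while prompts are waiting and nothing is running.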
@@ -51,6 +51,7 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = None
     device: str = 'auto'
     ray_workers_use_nsight: bool = False
+    scheduler_delay_factor: float = 0.0

     def __post_init__(self):
         if self.tokenizer is None:
@@ -305,6 +306,12 @@ class EngineArgs:
                             default=EngineArgs.device,
                             choices=["auto", "cuda", "neuron"],
                             help='Device type for vLLM execution.')
+        parser.add_argument(
+            '--scheduler-delay-factor',
+            type=float,
+            default=EngineArgs.scheduler_delay_factor,
+            help='Apply a delay (of delay factor multiplied by previous '
+            'prompt latency) before scheduling next prompt.')
         return parser

     @classmethod
@@ -342,7 +349,8 @@ class EngineArgs:
                                         ), self.ray_workers_use_nsight)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.max_model_len)
+                                           model_config.max_model_len,
+                                           self.scheduler_delay_factor)
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
             max_loras=self.max_loras,
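As a usage sketch (the model name and value below are illustrative, not part of this change): on the command line the new knob is passed as --scheduler-delay-factor, for example when launching the OpenAI-compatible server; from Python it can be passed through the LLM class, whose extra keyword arguments are forwarded to EngineArgs and from there into SchedulerConfig, as in the last hunk above:

from vllm import LLM

# scheduler_delay_factor flows from EngineArgs into SchedulerConfig and then
# into the Scheduler's _passed_delay check.
llm = LLM(model="facebook/opt-125m", scheduler_delay_factor=0.2)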