Dynamic scheduler delay to improve ITL performance (#3279)

Author: Thomas Parnell
Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com>
Committed by: GitHub
Date: 2024-03-22 20:28:14 +01:00
commit cf2f084d56
parent f721096d48
4 changed files with 72 additions and 2 deletions

View File

@@ -1,5 +1,6 @@
from typing import List
import pytest # noqa
import time
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
@@ -168,3 +169,36 @@ def test_scheduler_max_seqs():
# and one is prompting.
_, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]])
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
# schedule first prompt
_, seq_group = create_dummy_prompt("0", prompt_length=block_size)
scheduler.add_seq_group(seq_group)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '0'
# wait for a second before scheduling next prompt
time.sleep(1)
_, seq_group = create_dummy_prompt("1", prompt_length=block_size)
scheduler.add_seq_group(seq_group)
# second prompt should *not* be scheduled
seq_group_meta, out = scheduler.schedule()
assert not out.prompt_run
assert seq_group_meta[0].request_id == '0'
# wait for more than 0.5 second and try again
time.sleep(0.6)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '1'
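The new test above exercises the full behaviour: the first prompt is scheduled immediately, a second prompt that arrives while the delay window (delay_factor multiplied by the latency of the previous prompt step) is still open is held back in favour of a decode step, and once enough time has passed it is picked up. To run only this test locally (assuming the file shown here is tests/core/test_scheduler.py, which this view does not name):

pytest -q tests/core/test_scheduler.py::test_scheduler_delay_factor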

View File

@@ -517,6 +517,8 @@ class SchedulerConfig:
iteration.
max_model_len: Maximum length of a sequence (including prompt
and generated text).
delay_factor: Apply a delay (of delay factor multiplied by previous
prompt latency) before scheduling next prompt.
"""
def __init__(
@@ -524,6 +526,7 @@ class SchedulerConfig:
max_num_batched_tokens: Optional[int],
max_num_seqs: int,
max_model_len: int,
delay_factor: float = 0.0,
) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
@@ -533,6 +536,7 @@ class SchedulerConfig:
self.max_num_batched_tokens = max(max_model_len, 2048)
self.max_num_seqs = max_num_seqs
self.max_model_len = max_model_len
self.delay_factor = delay_factor
self._verify_args()
def _verify_args(self) -> None:
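For context, a minimal sketch of constructing the updated SchedulerConfig directly, mirroring the constructor shown above (the concrete values are illustrative only):

from vllm.config import SchedulerConfig

# delay_factor=0.5 means: wait up to 0.5x the latency of the previous prompt
# step before scheduling the next prompt, giving the waiting queue time to
# fill so more prompts can be batched together.
scheduler_config = SchedulerConfig(
    max_num_batched_tokens=2048,
    max_num_seqs=64,
    max_model_len=2048,
    delay_factor=0.5,
)
assert scheduler_config.delay_factor == 0.5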

View File

@@ -103,6 +103,13 @@ class Scheduler:
# Sequence groups in the SWAPPED state.
self.swapped: Deque[SequenceGroup] = deque()
# Time at previous scheduling step
self.prev_time = 0.0
# Did we schedule a prompt at previous step?
self.prev_prompt = False
# Latency of the last prompt step
self.last_prompt_latency = 0.0
@property
def lora_enabled(self) -> bool:
return bool(self.lora_config)
@@ -179,7 +186,7 @@
# are added to the back.
leftover_waiting_sequences = deque()
num_batched_tokens = 0
while self.waiting:
while self._passed_delay(now) and self.waiting:
seq_group = self.waiting[0]
waiting_seqs = seq_group.get_seqs(
status=SequenceStatus.WAITING)
@@ -246,6 +253,7 @@
self.waiting.extendleft(leftover_waiting_sequences)
if scheduled or ignored_seq_groups:
self.prev_prompt = True
scheduler_outputs = SchedulerOutputs(
scheduled_seq_groups=scheduled,
prompt_run=True,
@@ -491,3 +499,19 @@
def mark_blocks_as_computed(self, seq_group: SequenceGroup):
self.block_manager.mark_blocks_as_computed(seq_group)
def _passed_delay(self, now: float) -> bool:
if self.prev_prompt:
self.last_prompt_latency = now - self.prev_time
self.prev_time, self.prev_prompt = now, False
# Delay scheduling prompts to let waiting queue fill up
if self.scheduler_config.delay_factor > 0 and self.waiting:
earliest_arrival_time = min(
[e.metrics.arrival_time for e in self.waiting])
passed_delay = (
(now - earliest_arrival_time) >
(self.scheduler_config.delay_factor * self.last_prompt_latency)
or not self.running)
else:
passed_delay = True
return passed_delay
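_passed_delay reduces to a single inequality: schedule a prompt only once the oldest waiting request has been queued for longer than delay_factor times the latency of the previous prompt step, or immediately when there is no running (decode) work to fall back on. A standalone restatement of that check with illustrative numbers, not the scheduler code itself:

def passed_delay(now, last_prompt_latency, earliest_arrival_time,
                 delay_factor, has_running_seqs):
    # No delay configured: always allow prompt scheduling.
    if delay_factor <= 0:
        return True
    waited = now - earliest_arrival_time
    # Hold the prompt back while decodes keep the engine busy and the oldest
    # waiting request has not yet waited delay_factor * last prompt latency.
    return waited > delay_factor * last_prompt_latency or not has_running_seqs

# Last prompt step took 0.8 s, the oldest waiting request arrived 0.3 s ago,
# and decodes are still running: 0.3 < 0.5 * 0.8, so the prompt waits.
print(passed_delay(now=10.3, last_prompt_latency=0.8,
                   earliest_arrival_time=10.0, delay_factor=0.5,
                   has_running_seqs=True))  # False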

View File

@@ -51,6 +51,7 @@ class EngineArgs:
max_cpu_loras: Optional[int] = None
device: str = 'auto'
ray_workers_use_nsight: bool = False
scheduler_delay_factor: float = 0.0
def __post_init__(self):
if self.tokenizer is None:
@@ -305,6 +306,12 @@
default=EngineArgs.device,
choices=["auto", "cuda", "neuron"],
help='Device type for vLLM execution.')
parser.add_argument(
'--scheduler-delay-factor',
type=float,
default=EngineArgs.scheduler_delay_factor,
help='Apply a delay (of delay factor multiplied by previous '
'prompt latency) before scheduling next prompt.')
return parser
@classmethod
@@ -342,7 +349,8 @@
), self.ray_workers_use_nsight)
scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
self.max_num_seqs,
model_config.max_model_len)
model_config.max_model_len,
self.scheduler_delay_factor)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,
max_loras=self.max_loras,
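The new behaviour is opt-in: --scheduler-delay-factor defaults to 0.0, which leaves scheduling unchanged, and can be passed to any entrypoint that builds EngineArgs from the command line (e.g. --scheduler-delay-factor 0.5). For offline use, a hedged sketch assuming the LLM class forwards extra keyword arguments to EngineArgs (not shown in this diff):

from vllm import LLM

# scheduler_delay_factor is picked up by EngineArgs and threaded through to
# SchedulerConfig.delay_factor as in the change above.
llm = LLM(model="facebook/opt-125m", scheduler_delay_factor=0.5)
outputs = llm.generate(["Hello, my name is"])
print(outputs[0].outputs[0].text)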