Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[Benchmark] Support ready check timeout in vllm bench serve (#21696)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Committed by: GitHub

commit 3f36c325fa (parent 3dddbf1f25)
@@ -14,8 +14,8 @@ from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm import LLM, SamplingParams
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams
vllm/benchmarks/lib/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark library utilities."""
vllm/benchmarks/lib/ready_checker.py (new file, 70 lines)
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utilities for checking endpoint readiness."""
+
+import asyncio
+import time
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+from .endpoint_request_func import RequestFuncInput, RequestFuncOutput
+
+
+async def wait_for_endpoint(
+    request_func,
+    test_input: RequestFuncInput,
+    timeout_seconds: int = 600,
+    retry_interval: int = 5,
+) -> RequestFuncOutput:
+    """
+    Wait for an endpoint to become available before starting benchmarks.
+
+    Args:
+        request_func: The async request function to call
+        test_input: The RequestFuncInput to test with
+        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
+        retry_interval: Time between retries in seconds (default: 5 seconds)
+
+    Returns:
+        RequestFuncOutput: The successful response
+
+    Raises:
+        ValueError: If the endpoint doesn't become available within the timeout
+    """
+    deadline = time.perf_counter() + timeout_seconds
+    output = RequestFuncOutput(success=False)
+    print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
+
+    with tqdm(
+            total=timeout_seconds,
+            bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
+            unit="s",
+    ) as pbar:
+
+        while True:
+            # update progress bar
+            remaining = deadline - time.perf_counter()
+            elapsed = timeout_seconds - remaining
+            update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n)
+            pbar.update(update_amount)
+            pbar.refresh()
+            if remaining <= 0:
+                pbar.close()
+                break
+
+            # ping the endpoint using request_func
+            try:
+                output = await request_func(request_func_input=test_input)
+                if output.success:
+                    pbar.close()
+                    return output
+            except aiohttp.ClientConnectorError:
+                pass
+
+            # retry after a delay
+            sleep_duration = min(retry_interval, remaining)
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+    return output
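As a quick orientation for this new module, below is a minimal, self-contained sketch of the same deadline-plus-retry loop that wait_for_endpoint implements, using a stub probe in place of the benchmark's request function. Every name in it (probe, wait_until_ready, ready_after_s) is illustrative and not part of vLLM.

# Illustrative only: a stripped-down version of the readiness loop above,
# with a fake probe standing in for the real endpoint request function.
import asyncio
import time


async def probe(start: float, ready_after_s: float = 3.0) -> bool:
    # Pretend the endpoint only starts answering after ready_after_s seconds.
    return time.perf_counter() - start >= ready_after_s


async def wait_until_ready(timeout_s: float = 10.0,
                           retry_interval_s: float = 1.0) -> bool:
    start = time.perf_counter()
    deadline = start + timeout_s
    while True:
        if await probe(start):
            return True  # endpoint answered successfully
        remaining = deadline - time.perf_counter()
        if remaining <= 0:
            return False  # timed out; the caller decides how to fail
        await asyncio.sleep(min(retry_interval_s, remaining))


if __name__ == "__main__":
    print("endpoint ready:", asyncio.run(wait_until_ready()))

The committed implementation additionally wraps this loop in a tqdm progress bar and swallows aiohttp.ClientConnectorError while the server is still starting up.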
@@ -34,12 +34,12 @@ from transformers import PreTrainedTokenizerBase
 
 from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
                                       get_samples)
-from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
-                                                   OPENAI_COMPATIBLE_BACKENDS,
-                                                   RequestFuncInput,
-                                                   RequestFuncOutput)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+    RequestFuncOutput)
+from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -331,6 +331,7 @@ async def benchmark(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
+    ready_check_timeout_sec: int = 600,
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -359,7 +360,8 @@ async def benchmark(
         extra_body=extra_body,
     )
 
-    test_output = await request_func(request_func_input=test_input)
+    test_output = await wait_for_endpoint(
+        request_func, test_input, timeout_seconds=ready_check_timeout_sec)
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
@@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ending request rate for ramp-up (RPS). "
         "Needs to be specified when --ramp-up-strategy is used.",
     )
+    parser.add_argument(
+        "--ready-check-timeout-sec",
+        type=int,
+        default=600,
+        help="Maximum time to wait for the endpoint to become ready "
+        "in seconds (default: 600 seconds / 10 minutes).",
+    )
 
 
 def main(args: argparse.Namespace):
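To make the new flag's behavior concrete, here is a standalone argparse snippet. Only the add_argument call mirrors the commit; the surrounding parser and the sample values are assumptions for the sake of a runnable example, and this is not the vLLM parser itself.

# Standalone illustration: the dashed flag parses into the
# ready_check_timeout_sec attribute that main() forwards to benchmark().
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--ready-check-timeout-sec",
    type=int,
    default=600,
    help="Maximum time to wait for the endpoint to become ready "
    "in seconds (default: 600 seconds / 10 minutes).",
)

# Dashes in the flag name become underscores in the parsed namespace.
args = parser.parse_args(["--ready-check-timeout-sec", "120"])
assert args.ready_check_timeout_sec == 120

# Omitting the flag falls back to the 600-second (10-minute) default.
assert parser.parse_args([]).ready_check_timeout_sec == 600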
@@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace):
             ramp_up_strategy=args.ramp_up_strategy,
             ramp_up_start_rps=args.ramp_up_start_rps,
             ramp_up_end_rps=args.ramp_up_end_rps,
+            ready_check_timeout_sec=args.ready_check_timeout_sec,
         ))
 
     # Save config and results to json
@@ -21,8 +21,8 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
                                       InstructCoderDataset, RandomDataset,
                                       SampleRequest, ShareGPTDataset,
                                       SonnetDataset, VisionArenaDataset)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)