[Benchmark] Support ready check timeout in vllm bench serve (#21696)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Author: Ye (Charlotte) Qi
Date: 2025-08-03 00:52:38 -07:00 (committed by GitHub)
Commit: 3f36c325fa (parent: 3dddbf1f25)
7 changed files with 94 additions and 11 deletions


@@ -14,8 +14,8 @@ from tqdm import tqdm
 import vllm.envs as envs
 from vllm import LLM, SamplingParams
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams


@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark library utilities."""


@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utilities for checking endpoint readiness."""

import asyncio
import time

import aiohttp
from tqdm.asyncio import tqdm

from .endpoint_request_func import RequestFuncInput, RequestFuncOutput


async def wait_for_endpoint(
    request_func,
    test_input: RequestFuncInput,
    timeout_seconds: int = 600,
    retry_interval: int = 5,
) -> RequestFuncOutput:
    """
    Wait for an endpoint to become available before starting benchmarks.

    Args:
        request_func: The async request function to call
        test_input: The RequestFuncInput to test with
        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
        retry_interval: Time between retries in seconds (default: 5 seconds)

    Returns:
        RequestFuncOutput: The successful response

    Raises:
        ValueError: If the endpoint doesn't become available within the timeout
    """
    deadline = time.perf_counter() + timeout_seconds
    output = RequestFuncOutput(success=False)

    print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
    with tqdm(
            total=timeout_seconds,
            bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
            unit="s",
    ) as pbar:
        while True:
            # update progress bar
            remaining = deadline - time.perf_counter()
            elapsed = timeout_seconds - remaining
            update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n)
            pbar.update(update_amount)
            pbar.refresh()
            if remaining <= 0:
                pbar.close()
                break

            # ping the endpoint using request_func
            try:
                output = await request_func(request_func_input=test_input)
                if output.success:
                    pbar.close()
                    return output
            except aiohttp.ClientConnectorError:
                pass

            # retry after a delay
            sleep_duration = min(retry_interval, remaining)
            if sleep_duration > 0:
                await asyncio.sleep(sleep_duration)

    return output
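
For illustration, here is a minimal standalone sketch of how wait_for_endpoint might be driven outside the benchmark harness. The backend key, model name, prompt, and URL are placeholders (assumptions, not values from this commit); the request function is looked up from ASYNC_REQUEST_FUNCS in the same way vllm bench serve does.

# Hypothetical usage sketch -- backend key, model name, and URL are placeholders.
import asyncio

from vllm.benchmarks.lib.endpoint_request_func import (
    ASYNC_REQUEST_FUNCS, RequestFuncInput)
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint


async def check_ready() -> None:
    # "openai" is an assumed backend key; pick whichever backend you benchmark.
    request_func = ASYNC_REQUEST_FUNCS["openai"]
    test_input = RequestFuncInput(
        prompt="ping",                                    # trivial probe prompt
        api_url="http://localhost:8000/v1/completions",   # placeholder URL
        prompt_len=1,
        output_len=1,
        model="placeholder-model",                        # placeholder model name
    )
    # Poll every 5 seconds, giving up after 2 minutes.
    output = await wait_for_endpoint(request_func,
                                     test_input,
                                     timeout_seconds=120,
                                     retry_interval=5)
    if not output.success:
        raise ValueError("Endpoint did not become ready within 120 seconds")


if __name__ == "__main__":
    asyncio.run(check_ready())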


@@ -34,12 +34,12 @@ from transformers import PreTrainedTokenizerBase
 from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
                                       get_samples)
-from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
-                                                   OPENAI_COMPATIBLE_BACKENDS,
-                                                   RequestFuncInput,
-                                                   RequestFuncOutput)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+    RequestFuncOutput)
+from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.transformers_utils.tokenizer import get_tokenizer

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -331,6 +331,7 @@ async def benchmark(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
+    ready_check_timeout_sec: int = 600,
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -359,7 +360,8 @@
         extra_body=extra_body,
     )
-    test_output = await request_func(request_func_input=test_input)
+    test_output = await wait_for_endpoint(
+        request_func, test_input, timeout_seconds=ready_check_timeout_sec)
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
@@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ending request rate for ramp-up (RPS). "
         "Needs to be specified when --ramp-up-strategy is used.",
     )
+    parser.add_argument(
+        "--ready-check-timeout-sec",
+        type=int,
+        default=600,
+        help="Maximum time to wait for the endpoint to become ready "
+        "in seconds (default: 600 seconds / 10 minutes).",
+    )


 def main(args: argparse.Namespace):
@@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace):
         ramp_up_strategy=args.ramp_up_strategy,
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
+        ready_check_timeout_sec=args.ready_check_timeout_sec,
     ))

     # Save config and results to json
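
As a usage illustration (not part of the commit), the new flag would typically be passed alongside the existing serve-benchmark options, e.g.:

    vllm bench serve --model <served-model-name> --ready-check-timeout-sec 120

which caps the readiness wait at two minutes instead of the default ten; the remaining flags (dataset, request rate, and so on) are unchanged by this commit.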


@@ -21,8 +21,8 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
                                       InstructCoderDataset, RandomDataset,
                                       SampleRequest, ShareGPTDataset,
                                       SonnetDataset, VisionArenaDataset)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)