From 31f58be96ae3e5dfc542604553ecead06cb5c487 Mon Sep 17 00:00:00 2001 From: liusiqian-tal <141730978+liusiqian-tal@users.noreply.github.com> Date: Tue, 10 Jun 2025 05:41:21 +0800 Subject: [PATCH] [Frontend] Make TIMEOUT_KEEP_ALIVE configurable through env var (#18472) Signed-off-by: liusiqian --- tests/async_engine/api_server_async_engine.py | 12 ++++++------ vllm/entrypoints/api_server.py | 4 ++-- vllm/entrypoints/openai/api_server.py | 4 +--- vllm/envs.py | 5 +++++ 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 163185b90b..ec6b20f5e0 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -8,6 +8,7 @@ import uvicorn from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.utils import FlexibleArgumentParser @@ -46,9 +47,8 @@ if __name__ == "__main__": engine_args = AsyncEngineArgs.from_cli_args(args) engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) vllm.entrypoints.api_server.engine = engine - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) + uvicorn.run(app, + host=args.host, + port=args.port, + log_level="debug", + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 56f8754c26..3d1e5dc14d 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -17,6 +17,7 @@ from typing import Any, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.launcher import serve_http @@ -29,7 +30,6 @@ from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") -TIMEOUT_KEEP_ALIVE = 5 # seconds. app = FastAPI() engine = None @@ -134,7 +134,7 @@ async def run_server(args: Namespace, host=args.host, port=args.port, log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2f8819bca6..62f1c6a7c1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -103,8 +103,6 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION -TIMEOUT_KEEP_ALIVE = 5 # seconds - prometheus_multiproc_dir: tempfile.TemporaryDirectory # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) @@ -1360,7 +1358,7 @@ async def run_server_worker(listen_address, # NOTE: When the 'disable_uvicorn_access_log' value is True, # no access log will be output. access_log=not args.disable_uvicorn_access_log, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, diff --git a/vllm/envs.py b/vllm/envs.py index 9511ed1cb4..6f876d3df6 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -71,6 +71,7 @@ if TYPE_CHECKING: VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_RPC_TIMEOUT: int = 10000 # ms + VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_PLUGINS: Optional[list[str]] = None VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None @@ -557,6 +558,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), + # Timeout in seconds for keeping HTTP connections alive in API server + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": + lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")), + # a list of plugin names to load, separated by commas. # if this is not set, it means all plugins will be loaded # if this is set to an empty string, no plugins will be loaded