mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
1da94e673c | |||
d8b736f913 | |||
3a8708f60a |
10
vllm/entrypoints/constants.py
Normal file
10
vllm/entrypoints/constants.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Shared constants for vLLM entrypoints.
|
||||
"""
|
||||
|
||||
# HTTP header limits for h11 parser
|
||||
# These constants help mitigate header abuse attacks
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
|
||||
H11_MAX_HEADER_COUNT_DEFAULT = 256
|
@@ -14,6 +14,8 @@ from vllm import envs
|
||||
from vllm.engine.async_llm_engine import AsyncEngineDeadError
|
||||
from vllm.engine.multiprocessing import MQEngineDeadError
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
|
||||
from vllm.entrypoints.ssl import SSLCertRefresher
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import find_process_using_port
|
||||
@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI,
|
||||
sock: Optional[socket.socket],
|
||||
enable_ssl_refresh: bool = False,
|
||||
**uvicorn_kwargs: Any):
|
||||
"""
|
||||
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
|
||||
options. Supports http header limits via h11_max_incomplete_event_size and
|
||||
h11_max_header_count.
|
||||
"""
|
||||
logger.info("Available routes are:")
|
||||
for route in app.routes:
|
||||
methods = getattr(route, "methods", None)
|
||||
@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI,
|
||||
|
||||
logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
|
||||
|
||||
# Extract header limit options if present
|
||||
h11_max_incomplete_event_size = uvicorn_kwargs.pop(
|
||||
"h11_max_incomplete_event_size", None)
|
||||
h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)
|
||||
|
||||
# Set safe defaults if not provided
|
||||
if h11_max_incomplete_event_size is None:
|
||||
h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
|
||||
if h11_max_header_count is None:
|
||||
h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT
|
||||
|
||||
config = uvicorn.Config(app, **uvicorn_kwargs)
|
||||
# Set header limits
|
||||
config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
|
||||
config.h11_max_header_count = h11_max_header_count
|
||||
config.load()
|
||||
server = uvicorn.Server(config)
|
||||
_add_shutdown_handlers(app, server)
|
||||
|
@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address,
|
||||
ssl_certfile=args.ssl_certfile,
|
||||
ssl_ca_certs=args.ssl_ca_certs,
|
||||
ssl_cert_reqs=args.ssl_cert_reqs,
|
||||
h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
|
||||
h11_max_header_count=args.h11_max_header_count,
|
||||
**uvicorn_kwargs,
|
||||
)
|
||||
|
||||
|
@@ -20,6 +20,8 @@ from vllm.config import config
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
|
||||
validate_chat_template)
|
||||
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from vllm.logger import init_logger
|
||||
@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
|
||||
enable_log_outputs: bool = False
|
||||
"""If set to True, enable logging of model outputs (generations)
|
||||
in addition to the input logging that is enabled by default."""
|
||||
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
|
||||
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
|
||||
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
|
||||
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
|
||||
"""Maximum number of HTTP headers allowed in a request for h11 parser.
|
||||
Helps mitigate header abuse. Default: 256."""
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
|
@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser):
|
||||
"valid JSON object in tool '%s', will try other "
|
||||
"methods to parse it.", param_value, param_name,
|
||||
func_name)
|
||||
try:
|
||||
converted_value = eval(param_value)
|
||||
return converted_value
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Parsed value '%s' of parameter '%s' cannot be "
|
||||
"converted via Python `eval()` in tool '%s', "
|
||||
"degenerating to string.", param_value, param_name,
|
||||
func_name)
|
||||
logger.warning(
|
||||
"Parameter '%s' has unknown type '%s'. "
|
||||
"The value will be treated as a string.", param_name,
|
||||
param_type)
|
||||
return param_value
|
||||
|
||||
# Extract function name
|
||||
|
@@ -21,7 +21,7 @@ logger = init_logger(__name__)
|
||||
|
||||
class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
|
||||
# enable full CUDA Graph support for decode-only capture
|
||||
attn_cudagraph_support: ClassVar[
|
||||
cudagraph_support: ClassVar[
|
||||
AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user