Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
Add "/server_info" endpoint in api_server to retrieve the vllm_config. (#16572)
Signed-off-by: Xihui Cang <xihuicang@gmail.com>
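This adds a GET /server_info endpoint that returns the running engine's VllmConfig as a string; the route is only registered when the server is started with the VLLM_SERVER_DEV_MODE environment variable enabled (see the api_server hunk below). A minimal sketch of reading it from a client; the host, port, and use of the requests library are illustrative assumptions, not part of this commit:

    # Sketch: fetch the vllm_config exposed by the new dev-mode endpoint.
    # Assumes the server was launched with VLLM_SERVER_DEV_MODE=1 and is
    # reachable at localhost:8000; adjust the base URL for your deployment.
    import requests

    resp = requests.get("http://localhost:8000/server_info")
    resp.raise_for_status()
    # The handler returns {"vllm_config": str(vllm_config)}, i.e. the config
    # rendered as a single string rather than as structured JSON.
    print(resp.json()["vllm_config"])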
@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient):
             exception=asyncio.CancelledError,
             verbose=self.log_requests)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        return self.engine.get_vllm_config()
+
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         return self.engine.get_model_config()
@@ -914,6 +914,10 @@ class LLMEngine:
             scheduler.abort_seq_group(
                 request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
 
+    def get_vllm_config(self) -> VllmConfig:
+        """Gets the vllm configuration."""
+        return self.vllm_config
+
     def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
         self._errored_with: Optional[BaseException] = None
 
         # Get the configs.
+        self.vllm_config = engine_config
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient):
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
         return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_decoding_config(self) -> DecodingConfig:
         return self.decoding_config
 
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from typing import AsyncGenerator, List, Mapping, Optional
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
-from vllm.config import DecodingConfig, ModelConfig
+from vllm.config import DecodingConfig, ModelConfig, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -220,6 +220,11 @@ class EngineClient(ABC):
         """
         ...
 
+    @abstractmethod
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        ...
+
     @abstractmethod
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
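Since get_vllm_config is introduced as an @abstractmethod on EngineClient, any out-of-tree EngineClient implementation now has to provide it alongside get_model_config. A rough sketch of the minimal addition, assuming the implementation already holds on to the VllmConfig it was built from (the class name and attribute here are hypothetical):

    # Hypothetical third-party client; only the newly required method is shown.
    from vllm.config import VllmConfig

    class MyEngineClient:  # would derive from EngineClient in a real implementation
        def __init__(self, vllm_config: VllmConfig):
            self.vllm_config = vllm_config  # hypothetical storage of the config

        async def get_vllm_config(self) -> VllmConfig:
            # Mirror the in-tree clients: simply return the stored config.
            return self.vllm_config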
@@ -30,7 +30,7 @@ from starlette.routing import Mount
 from typing_extensions import assert_never
 
 import vllm.envs as envs
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
             "/load",
             "/ping",
             "/version",
+            "/server_info",
         ],
         registry=registry,
     ).add().instrument(app).expose(app)
@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
 
 if envs.VLLM_SERVER_DEV_MODE:
 
+    @router.get("/server_info")
+    async def show_server_info(raw_request: Request):
+        server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
+        return JSONResponse(content=server_info)
+
     @router.post("/reset_prefix_cache")
     async def reset_prefix_cache(raw_request: Request):
         """
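Note that the decorator above sits inside the if envs.VLLM_SERVER_DEV_MODE: block, so on a server started without dev mode the route is never registered and a request to /server_info simply gets FastAPI's default 404 response. A small probe sketch under the same assumptions as the example near the top (host, port, requests):

    import requests

    resp = requests.get("http://localhost:8000/server_info")
    if resp.status_code == 404:
        # Route was never registered: the server is running without dev mode.
        print("start the server with VLLM_SERVER_DEV_MODE=1 to expose /server_info")
    else:
        print(resp.json()["vllm_config"])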
@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI:
 
 async def init_app_state(
     engine_client: EngineClient,
-    model_config: ModelConfig,
+    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
@@ -915,6 +921,8 @@ async def init_app_state(
 
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
+    state.vllm_config = vllm_config
+    model_config = vllm_config.model_config
 
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
-        model_config = await engine_client.get_model_config()
-        await init_app_state(engine_client, model_config, app.state, args)
+        vllm_config = await engine_client.get_vllm_config()
+        await init_app_state(engine_client, vllm_config, app.state, args)
 
     def _listen_addr(a: str) -> str:
         if is_valid_ipv6_address(a):
@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient):
         assert start_engine_loop
 
         self.model_config = vllm_config.model_config
-
+        self.vllm_config = vllm_config
         self.log_requests = log_requests
         self.log_stats = log_stats
 
@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient):
     ):
         raise ValueError("Not Supported on V1 yet.")
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
@@ -230,6 +230,9 @@ class LLMEngine:
 
         return processed_outputs.request_outputs
 
+    def get_vllm_config(self):
+        return self.vllm_config
+
     def get_model_config(self):
         return self.model_config
 