Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[Docs] Add comprehensive CLI reference for all large vllm subcommands (#22601)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -11,7 +11,7 @@ nav:
   - Quick Links:
     - User Guide: usage/README.md
     - Developer Guide: contributing/README.md
-    - API Reference: api/summary.md
+    - API Reference: api/README.md
     - CLI Reference: cli/README.md
   - Timeline:
     - Roadmap: https://roadmap.vllm.ai
@@ -58,11 +58,9 @@ nav:
   - CI: contributing/ci
   - Design Documents: design
   - API Reference:
-    - Summary: api/summary.md
-    - Contents:
-      - api/vllm/*
-  - CLI Reference:
-    - Summary: cli/README.md
+    - api/README.md
+    - api/vllm/*
+  - CLI Reference: cli
   - Community:
     - community/*
   - Blog: https://blog.vllm.ai
docs/cli/.meta.yml (new file)
@@ -0,0 +1 @@
+toc_depth: 3
docs/cli/.nav.yml (new file)
@@ -0,0 +1,8 @@
+nav:
+  - README.md
+  - serve.md
+  - chat.md
+  - complete.md
+  - run-batch.md
+  - vllm bench:
+    - bench/*.md
@@ -1,7 +1,3 @@
----
-toc_depth: 4
----
-
 # vLLM CLI Guide
 
 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@@ -16,52 +12,48 @@ Available Commands:
 vllm {chat,complete,serve,bench,collect-env,run-batch}
 ```
 
-When passing JSON CLI arguments, the following sets of arguments are equivalent:
-
-- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
-- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
-
-Additionally, list elements can be passed individually using `+`:
-
-- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
-- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
-
 ## serve
 
-Start the vLLM OpenAI Compatible API server.
-
-??? console "Examples"
-
-    ```bash
-    # Start with a model
-    vllm serve meta-llama/Llama-2-7b-hf
-
-    # Specify the port
-    vllm serve meta-llama/Llama-2-7b-hf --port 8100
-
-    # Serve over a Unix domain socket
-    vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
-
-    # Check with --help for more options
-    # To list all groups
-    vllm serve --help=listgroup
-
-    # To view a argument group
-    vllm serve --help=ModelConfig
-
-    # To view a single argument
-    vllm serve --help=max-num-seqs
-
-    # To search by keyword
-    vllm serve --help=max
-
-    # To view full help with pager (less/more)
-    vllm serve --help=page
-    ```
-
-### Options
-
---8<-- "docs/argparse/serve.md"
+Starts the vLLM OpenAI Compatible API server.
+
+Start with a model:
+
+```bash
+vllm serve meta-llama/Llama-2-7b-hf
+```
+
+Specify the port:
+
+```bash
+vllm serve meta-llama/Llama-2-7b-hf --port 8100
+```
+
+Serve over a Unix domain socket:
+
+```bash
+vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
+```
+
+Check with --help for more options:
+
+```bash
+# To list all groups
+vllm serve --help=listgroup
+
+# To view a argument group
+vllm serve --help=ModelConfig
+
+# To view a single argument
+vllm serve --help=max-num-seqs
+
+# To search by keyword
+vllm serve --help=max
+
+# To view full help with pager (less/more)
+vllm serve --help=page
+```
+
+See [vllm serve](./serve.md) for the full reference of all available arguments.
 
 ## chat
 
@@ -78,6 +70,8 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
 vllm chat --quick "hi"
 ```
 
+See [vllm chat](./chat.md) for the full reference of all available arguments.
+
 ## complete
 
 Generate text completions based on the given prompt via the running API server.
@@ -93,7 +87,7 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
 vllm complete --quick "The future of AI is"
 ```
 
-</details>
+See [vllm complete](./complete.md) for the full reference of all available arguments.
 
 ## bench
 
@@ -120,6 +114,8 @@ vllm bench latency \
     --load-format dummy
 ```
 
+See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments.
+
 ### serve
 
 Benchmark the online serving throughput.
@@ -134,6 +130,8 @@ vllm bench serve \
     --num-prompts 5
 ```
 
+See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
+
 ### throughput
 
 Benchmark offline inference throughput.
@@ -147,6 +145,8 @@ vllm bench throughput \
     --load-format dummy
 ```
 
+See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments.
+
 ## collect-env
 
 Start collecting environment information.
@@ -159,24 +159,25 @@ vllm collect-env
 
 Run batch prompts and write results to file.
 
-<details>
-<summary>Examples</summary>
+Running with a local file:
 
 ```bash
-# Running with a local file
 vllm run-batch \
     -i offline_inference/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
+```
 
-# Using remote file
+Using remote file:
+
+```bash
 vllm run-batch \
     -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
     -o results.jsonl \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
-</details>
+See [vllm run-batch](./run-batch.md) for the full reference of all available arguments.
 
 ## More Help
 
docs/cli/bench/latency.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench latency
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_latency.md"
docs/cli/bench/serve.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_serve.md"
docs/cli/bench/throughput.md (new file)
@@ -0,0 +1,9 @@
+# vllm bench throughput
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_throughput.md"
docs/cli/chat.md (new file)
@@ -0,0 +1,5 @@
+# vllm chat
+
+## Options
+
+--8<-- "docs/argparse/chat.md"
docs/cli/complete.md (new file)
@@ -0,0 +1,5 @@
+# vllm complete
+
+## Options
+
+--8<-- "docs/argparse/complete.md"
docs/cli/json_tip.inc.md (new file)
@@ -0,0 +1,9 @@
+When passing JSON CLI arguments, the following sets of arguments are equivalent:
+
+- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
+- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
+
+Additionally, list elements can be passed individually using `+`:
+
+- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
+- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
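The two spellings above are just different ways of writing the same nested value. The following standalone Python sketch shows that equivalence; it is illustrative only, and `set_dotted` is a hypothetical helper, not vLLM's actual parser code.

```python
# Illustrative only: build the nested value that the equivalent flag
# spellings in json_tip.inc.md denote. This is not vLLM's argument parser.
import json


def set_dotted(cfg: dict, dotted_key: str, value) -> None:
    """Assign `value` at a dotted path, e.g. "key2.key3" -> cfg["key2"]["key3"]."""
    *parents, leaf = dotted_key.split(".")
    node = cfg
    for part in parents:
        node = node.setdefault(part, {})
    if leaf.endswith("+"):  # a trailing `+` appends to a list-valued key
        values = value if isinstance(value, list) else [value]
        node.setdefault(leaf[:-1], []).extend(values)
    else:
        node[leaf] = value


cfg: dict = {}
set_dotted(cfg, "key1", "value1")
set_dotted(cfg, "key2.key3", "value2")
set_dotted(cfg, "key4+", "value3")
set_dotted(cfg, "key4+", ["value4", "value5"])
print(json.dumps(cfg))
# {"key1": "value1", "key2": {"key3": "value2"}, "key4": ["value3", "value4", "value5"]}
```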
docs/cli/run-batch.md (new file)
@@ -0,0 +1,9 @@
+# vllm run-batch
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/run-batch.md"
docs/cli/serve.md (new file)
@@ -0,0 +1,9 @@
+# vllm serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/serve.md"
@@ -11,15 +11,7 @@ Engine arguments control the behavior of the vLLM engine.
 
 The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
 
-When passing JSON CLI arguments, the following sets of arguments are equivalent:
-
-- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
-- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
-
-Additionally, list elements can be passed individually using `+`:
-
-- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
-- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
+--8<-- "docs/cli/json_tip.inc.md"
 
 ## `EngineArgs`
 
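Since the documented flags come from `EngineArgs.add_cli_args`, the same arguments can also be wired up programmatically. A rough sketch, assuming a working vLLM installation and example flag values:

```python
# Rough sketch, assuming a working vLLM installation; flag values are examples.
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="EngineArgs demo")
EngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m", "--max-num-seqs", "16"])

# The parsed namespace maps back onto the dataclass, whose fields mirror the
# configuration classes in vllm.config mentioned above.
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.model, engine_args.max_num_seqs)
```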
@@ -15,8 +15,14 @@ sys.modules["aiohttp"] = MagicMock()
 sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()
 
+from vllm.benchmarks import latency  # noqa: E402
+from vllm.benchmarks import serve  # noqa: E402
+from vllm.benchmarks import throughput  # noqa: E402
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
-from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
+from vllm.entrypoints.cli.openai import ChatCommand  # noqa: E402
+from vllm.entrypoints.cli.openai import CompleteCommand  # noqa: E402
+from vllm.entrypoints.openai import cli_args  # noqa: E402
+from vllm.entrypoints.openai import run_batch  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402
 
 logger = logging.getLogger("mkdocs")
@@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter):
             self._markdown_output.append(
                 f"Possible choices: {metavar}\n\n")
 
-        self._markdown_output.append(f"{action.help}\n\n")
+        if action.help:
+            self._markdown_output.append(f"{action.help}\n\n")
 
         if (default := action.default) != SUPPRESS:
             self._markdown_output.append(f"Default: `{default}`\n\n")
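The new `if action.help:` guard matters because argparse actions created without `help=` carry `help=None`, which the old unconditional append would have rendered literally. A plain-argparse illustration:

```python
# Plain argparse, no vLLM needed: an action added without help= has help=None,
# so the markdown formatter should skip it rather than emit the string "None".
from argparse import ArgumentParser

parser = ArgumentParser()
action = parser.add_argument("--undocumented-flag")
print(repr(action.help))  # -> None
```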
@@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter):
         return "".join(self._markdown_output)
 
 
-def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
+def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
     """Create a parser for the given class with markdown formatting.
 
     Args:
@@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
     Returns:
         FlexibleArgumentParser: A parser with markdown formatting for the class.
     """
-    parser = FlexibleArgumentParser()
+    parser = FlexibleArgumentParser(add_json_tip=False)
     parser.formatter_class = MarkdownFormatter
     with patch("vllm.config.DeviceConfig.__post_init__"):
-        return cls.add_cli_args(parser, **kwargs)
-
-
-def create_serve_parser() -> FlexibleArgumentParser:
-    """Create a parser for the serve command with markdown formatting."""
-    parser = FlexibleArgumentParser()
-    parser.formatter_class = lambda prog: MarkdownFormatter(
-        prog, starting_heading_level=4)
-    return make_arg_parser(parser)
+        _parser = add_cli_args(parser, **kwargs)
+    # add_cli_args might be in-place so return parser if _parser is None
+    return _parser or parser
 
 
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
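The `_parser or parser` return exists because the `add_cli_args` callables passed in follow two conventions: some mutate the given parser and return `None`, others return a parser object. A minimal standalone sketch with hypothetical helpers, using plain argparse:

```python
# Minimal standalone sketch (hypothetical helpers, plain argparse) of why
# create_parser returns `_parser or parser`.
from argparse import ArgumentParser


def adds_in_place(parser: ArgumentParser) -> None:
    parser.add_argument("--alpha")          # mutates the parser, returns None


def returns_parser(parser: ArgumentParser) -> ArgumentParser:
    parser.add_argument("--beta")
    return parser                           # returns the parser it built on


for add_cli_args in (adds_in_place, returns_parser):
    parser = ArgumentParser()
    _parser = add_cli_args(parser)
    parser = _parser or parser              # works for both conventions
    print(parser.parse_args([]))
```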
@@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
 
     # Create parsers to document
     parsers = {
-        "engine_args": create_parser(EngineArgs),
-        "async_engine_args": create_parser(AsyncEngineArgs,
-                                           async_args_only=True),
-        "serve": create_serve_parser(),
+        "engine_args":
+        create_parser(EngineArgs.add_cli_args),
+        "async_engine_args":
+        create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True),
+        "serve":
+        create_parser(cli_args.make_arg_parser),
+        "chat":
+        create_parser(ChatCommand.add_cli_args),
+        "complete":
+        create_parser(CompleteCommand.add_cli_args),
+        "bench_latency":
+        create_parser(latency.add_cli_args),
+        "bench_throughput":
+        create_parser(throughput.add_cli_args),
+        "bench_serve":
+        create_parser(serve.add_cli_args),
+        "run-batch":
+        create_parser(run_batch.make_arg_parser),
     }
 
     # Generate documentation for each parser
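The step that consumes this dict is not shown in the hunk; based on the `--8<-- "docs/argparse/<name>.md"` includes added above, it presumably writes each parser's markdown-formatted help to a file of the same name. A hedged sketch of that write step, where the paths, function name, and loop are assumptions:

```python
# Hedged sketch of the step following the parsers dict (not shown in this hunk):
# write each parser's markdown-formatted help to docs/argparse/<name>.md, the
# files pulled in by the `--8<--` includes. Paths and naming are assumptions.
from argparse import ArgumentParser
from pathlib import Path


def write_argparse_docs(parsers: dict[str, ArgumentParser],
                        doc_dir: str = "docs/argparse") -> None:
    out_dir = Path(doc_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for stem, parser in parsers.items():
        # With MarkdownFormatter installed, format_help() returns markdown.
        (out_dir / f"{stem}.md").write_text(parser.format_help())
```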
@@ -29,3 +29,5 @@ setproctitle
 torch
 transformers
 zmq
+uvloop
+prometheus-client
@@ -24,8 +24,6 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
 from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
                                        write_to_json)
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
@@ -146,6 +144,8 @@ async def run_vllm_async(
     disable_detokenize: bool = False,
 ) -> float:
     from vllm import SamplingParams
+    from vllm.entrypoints.openai.api_server import (
+        build_async_engine_client_from_engine_args)
 
     async with build_async_engine_client_from_engine_args(
             engine_args,
@@ -130,28 +130,33 @@ class ChatCommand(CLISubcommand):
             conversation.append(response_message)  # type: ignore
             print(output)
 
-    def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
-        chat_parser = subparsers.add_parser(
-            "chat",
-            help="Generate chat completions via the running API server.",
-            description="Generate chat completions via the running API server.",
-            usage="vllm chat [options]")
-        _add_query_options(chat_parser)
-        chat_parser.add_argument(
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+        """Add CLI arguments for the chat command."""
+        _add_query_options(parser)
+        parser.add_argument(
             "--system-prompt",
             type=str,
             default=None,
             help=("The system prompt to be added to the chat template, "
                   "used for models that support system prompts."))
-        chat_parser.add_argument("-q",
+        parser.add_argument("-q",
                             "--quick",
                             type=str,
                             metavar="MESSAGE",
                             help=("Send a single prompt as MESSAGE "
                                   "and print the response, then exit."))
-        return chat_parser
+        return parser
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        parser = subparsers.add_parser(
+            "chat",
+            help="Generate chat completions via the running API server.",
+            description="Generate chat completions via the running API server.",
+            usage="vllm chat [options]")
+        return ChatCommand.add_cli_args(parser)
 
 
 class CompleteCommand(CLISubcommand):
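Extracting the static `add_cli_args` lets callers build a standalone `vllm chat` parser without the subcommand machinery, which is exactly how the docs hook uses it via `create_parser(ChatCommand.add_cli_args)`. A minimal usage sketch, assuming a vLLM installation:

```python
# Minimal usage sketch (assumes a vLLM installation): build a standalone
# `vllm chat` parser through the new static method.
from vllm.entrypoints.cli.openai import ChatCommand
from vllm.utils import FlexibleArgumentParser

parser = ChatCommand.add_cli_args(
    FlexibleArgumentParser(usage="vllm chat [options]"))
args = parser.parse_args(["--quick", "hi"])
print(args.quick)  # -> hi
```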
@@ -179,25 +184,30 @@ class CompleteCommand(CLISubcommand):
         output = completion.choices[0].text
         print(output)
 
-    def subparser_init(
-            self,
-            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
-        complete_parser = subparsers.add_parser(
-            "complete",
-            help=("Generate text completions based on the given prompt "
-                  "via the running API server."),
-            description=("Generate text completions based on the given prompt "
-                         "via the running API server."),
-            usage="vllm complete [options]")
-        _add_query_options(complete_parser)
-        complete_parser.add_argument(
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+        """Add CLI arguments for the complete command."""
+        _add_query_options(parser)
+        parser.add_argument(
             "-q",
             "--quick",
             type=str,
             metavar="PROMPT",
             help=
             "Send a single prompt and print the completion output, then exit.")
-        return complete_parser
+        return parser
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        parser = subparsers.add_parser(
+            "complete",
+            help=("Generate text completions based on the given prompt "
+                  "via the running API server."),
+            description=("Generate text completions based on the given prompt "
+                         "via the running API server."),
+            usage="vllm complete [options]")
+        return CompleteCommand.add_cli_args(parser)
 
 
 def cmd_init() -> list[CLISubcommand]:
@@ -20,7 +20,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 # yapf: disable
-from vllm.entrypoints.openai.api_server import build_async_engine_client
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               BatchRequestOutput,
                                               BatchResponseData,
@@ -34,7 +33,6 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
 from vllm.entrypoints.openai.serving_score import ServingScores
 from vllm.logger import init_logger
-from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -469,6 +467,9 @@ async def run_batch(
 
 
 async def main(args: Namespace):
+    from vllm.entrypoints.openai.api_server import build_async_engine_client
+    from vllm.usage.usage_lib import UsageContext
+
     async with build_async_engine_client(
             args,
             usage_context=UsageContext.OPENAI_BATCH_RUNNER,
@@ -1682,6 +1682,8 @@ class FlexibleArgumentParser(ArgumentParser):
         # Set the default "formatter_class" to SortedHelpFormatter
         if "formatter_class" not in kwargs:
            kwargs["formatter_class"] = SortedHelpFormatter
+        # Pop kwarg "add_json_tip" to control whether to add the JSON tip
+        self.add_json_tip = kwargs.pop("add_json_tip", True)
         super().__init__(*args, **kwargs)
 
         if sys.version_info < (3, 13):
@@ -1726,7 +1728,8 @@ class FlexibleArgumentParser(ArgumentParser):
     def format_help(self) -> str:
         # Add tip about JSON arguments to the epilog
         epilog = self.epilog or ""
-        if not epilog.startswith(FlexibleArgumentParser._json_tip):
+        if (self.add_json_tip
+                and not epilog.startswith(FlexibleArgumentParser._json_tip)):
             self.epilog = FlexibleArgumentParser._json_tip + epilog
         return super().format_help()
 
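The default keeps the JSON tip in `--help` output, while the docs hook passes `add_json_tip=False` because the generated pages embed `docs/cli/json_tip.inc.md` directly. A short usage sketch, assuming a vLLM installation:

```python
# Usage sketch (assumes a vLLM installation): the default keeps the JSON tip
# in --help output; the docs hook opts out because the generated pages embed
# docs/cli/json_tip.inc.md themselves.
from vllm.utils import FlexibleArgumentParser

FlexibleArgumentParser(description="with tip").print_help()

FlexibleArgumentParser(description="without tip",
                       add_json_tip=False).print_help()
```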