[Benchmark] Add flag --served-model-name to benchmark_serving_multi_turn (#22889)
Signed-off-by: daniels <daniels@pliops.com>
parent 01a08739e0
commit 3c8a787247
@@ -5,11 +5,13 @@ The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `re
 First start serving your model
 
 ```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
 
-vllm serve $MODEL_NAME --disable-log-requests
+vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
 ```
 
+The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).
+
 ## Synthetic Multi-Turn Conversations
 
 Download the following text file (used for generation of synthetic conversations)
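With `--served-model-name Llama`, the server's OpenAI-compatible API identifies the model as `Llama` instead of by its filesystem path. As a minimal sketch of a request against the server started above (assuming the default address `http://localhost:8000`; only standard Chat Completions payload fields are used):

```python
import json
import urllib.request

# The server above was started with --served-model-name Llama, so the
# request must name the model "Llama", not the local model path.
payload = {
    "model": "Llama",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```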
@@ -26,10 +28,10 @@ But you may use other text files if you prefer (using this specific file is not
 Then run the benchmarking script
 
 ```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
 
-python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
-    --num-clients 2 --max-active-conversations 6
+python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
+    --input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
 ```
 
 You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
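Before benchmarking, it can help to confirm which model id the server actually advertises, since `--served-model-name` in the benchmark command must match it. A quick check against the standard `/v1/models` endpoint (server address assumed as above):

```python
import json
import urllib.request

# Lists the model ids the server accepts; with --served-model-name Llama
# this prints ["Llama"] rather than the filesystem path.
with urllib.request.urlopen("http://localhost:8000/v1/models") as resp:
    print([m["id"] for m in json.load(resp)["data"]])
```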
@@ -825,9 +825,11 @@ def get_client_config(
     # Arguments for API requests
     chat_url = f"{args.url}/v1/chat/completions"
+
+    model_name = args.served_model_name if args.served_model_name else args.model
 
     req_args = RequestArgs(
         chat_url=chat_url,
-        model=args.model,
+        model=model_name,
         stream=not args.no_stream,
         limit_min_tokens=args.limit_min_tokens,
         limit_max_tokens=args.limit_max_tokens,
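Only a few fields of `RequestArgs` are visible in this hunk; a hedged sketch of the container as those lines imply it (field names come from the diff, types and defaults are assumptions):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestArgs:
    chat_url: str            # f"{args.url}/v1/chat/completions"
    model: str               # served name if given, else the model path
    stream: bool             # not args.no_stream
    limit_min_tokens: Optional[int] = None
    limit_max_tokens: Optional[int] = None
    # ...the real class has further fields not shown in this hunk
```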
@@ -1247,9 +1249,19 @@ async def main() -> None:
         default=0,
         help="Seed for random number generators (default: 0)",
     )
 
     parser.add_argument(
         "-m", "--model", type=str, required=True, help="Path of the LLM model"
     )
+
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the ``--model`` argument.",
+    )
 
     parser.add_argument(
         "-u",
         "--url",
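The fallback introduced here is easy to verify end to end; a self-contained sketch using only the two arguments from this diff (sample values are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, required=True)
parser.add_argument("--served-model-name", type=str, default=None)

# With the flag, the served name takes precedence over the path.
args = parser.parse_args(
    ["--model", "/models/meta-llama/Meta-Llama-3.1-8B-Instruct/",
     "--served-model-name", "Llama"]
)
print(args.served_model_name if args.served_model_name else args.model)  # Llama

# Without the flag, the path is used, preserving the old behavior.
args = parser.parse_args(
    ["--model", "/models/meta-llama/Meta-Llama-3.1-8B-Instruct/"]
)
print(args.served_model_name if args.served_model_name else args.model)
# /models/meta-llama/Meta-Llama-3.1-8B-Instruct/
```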