From 3c8a7872471a96b414f9861653dded55aa431795 Mon Sep 17 00:00:00 2001
From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com>
Date: Tue, 19 Aug 2025 10:48:07 +0300
Subject: [PATCH] [Benchmark] Add flag --served-model-name to
 benchmark_serving_multi_turn (#22889)

Signed-off-by: daniels
---
 benchmarks/multi_turn/README.md                 | 12 +++++++-----
 .../multi_turn/benchmark_serving_multi_turn.py  | 14 +++++++++++++-
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index ae0866ae60..7adf97bcf5 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -5,11 +5,13 @@ The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `re
 First start serving your model
 
 ```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
 
-vllm serve $MODEL_NAME --disable-log-requests
+vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
 ```
 
+The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).
+
 ## Synthetic Multi-Turn Conversations
 
 Download the following text file (used for generation of synthetic conversations)
@@ -26,10 +28,10 @@ But you may use other text files if you prefer (using this specific file is not
 Then run the benchmarking script
 
 ```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
 
-python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
---num-clients 2 --max-active-conversations 6
+python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
+--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
 ```
 
 You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 53c3207491..d23b7b6e45 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -825,9 +825,11 @@ def get_client_config(
     # Arguments for API requests
     chat_url = f"{args.url}/v1/chat/completions"
 
+    model_name = args.served_model_name if args.served_model_name else args.model
+
     req_args = RequestArgs(
         chat_url=chat_url,
-        model=args.model,
+        model=model_name,
         stream=not args.no_stream,
         limit_min_tokens=args.limit_min_tokens,
         limit_max_tokens=args.limit_max_tokens,
@@ -1247,9 +1249,19 @@ async def main() -> None:
         default=0,
         help="Seed for random number generators (default: 0)",
     )
+
     parser.add_argument(
         "-m", "--model", type=str, required=True, help="Path of the LLM model"
     )
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the ``--model`` argument. ",
+    )
+
     parser.add_argument(
         "-u", "--url",
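
Editor's note on the change above: vLLM's OpenAI-compatible endpoint only recognizes the name registered via `--served-model-name`, so the benchmark's request payload must carry that name rather than the local model path. The sketch below is illustrative only, not part of the patch; it assumes the server from the README example is running on vLLM's default port 8000 and was started with `--served-model-name Llama`.

```python
# Illustrative sketch only (not part of the patch). Shows why the request
# payload must use the served model name: the server registers "Llama",
# not the filesystem path that was passed to `vllm serve`.
import requests

chat_url = "http://localhost:8000/v1/chat/completions"  # default vLLM port

payload = {
    "model": "Llama",  # must match --served-model-name, not $MODEL_PATH
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 16,
}

resp = requests.post(chat_url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

Sending the path (e.g. `/models/meta-llama/...`) in `"model"` would be rejected by a server that registered a different served name, which is why the patch routes `args.served_model_name` into `RequestArgs` and falls back to `args.model` only when the new flag is absent.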