# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
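"""Convert vLLM benchmark JSON results into markdown tables.

Collects latency / throughput / serving results (plus their *.commands
sidecar files) from a results folder and renders them as markdown and
JSON summaries.
"""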

import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any

import pandas as pd
import psutil
import regex as re
from tabulate import tabulate

# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "model_id": "Model",
    "dataset_name": "Dataset Name",
    "input_len": "Input Len",
    "output_len": "Output Len",
    "tp_size": "TP Size",
    "pp_size": "PP Size",
    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "qps": "qps",
    "max_concurrency": "Max concurrency",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    # "total_input_tokens": "Total input tokens",
    # "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "p99_tpot_ms": "P99 TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
}


def read_markdown(file):
    if os.path.exists(file):
        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )


def get_size_with_unit(num_bytes, suffix="B"):
    """
    Scale a byte count to a human-readable string,
    e.g.:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if num_bytes < factor:
            return f"{num_bytes:.2f}{unit}{suffix}"
        num_bytes /= factor
    # anything beyond the petabyte range falls through to exabytes
    # (the original loop returned None implicitly for such values)
    return f"{num_bytes:.2f}E{suffix}"


def _coerce(val: str) -> Any:
    """Best-effort type coercion from string to Python types."""
    low = val.lower()
    if low == "null":
        return None
    if low == "true":
        return True
    if low == "false":
        return False
    # integers
    if re.fullmatch(r"[+-]?\d+", val):
        try:
            return int(val)
        except ValueError:
            pass
    # floats (keep 'inf'/'-inf'/'nan' as strings)
    if re.fullmatch(r"[+-]?\d*\.\d+", val):
        try:
            return float(val)
        except ValueError:
            pass
    return val
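

# Illustrative behavior of _coerce, derived from the rules above:
#   _coerce("null")  -> None
#   _coerce("true")  -> True
#   _coerce("-3")    -> -3
#   _coerce("0.75")  -> 0.75
#   _coerce("1e-3")  -> "1e-3"  (scientific notation stays a string)
#   _coerce("fp8")   -> "fp8"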


def parse_client_command(cmd: str) -> dict[str, Any]:
    """Parse the client_command shell string into {executable, script, args}."""
    toks = shlex.split(cmd)
    if len(toks) < 2:
        raise ValueError("client_command must include an executable and a script")
    executable, script = toks[0], toks[1]
    args: dict[str, Any] = {}

    i = 2
    while i < len(toks):
        t = toks[i]
        if t.startswith("--"):
            # --key=value or --key (value) or boolean flag
            if "=" in t:
                key, val = t.split("=", 1)
                if key == "--metadata":
                    md = {}
                    if val:
                        if "=" in val:
                            k, v = val.split("=", 1)
                            md[k] = _coerce(v)
                        else:
                            md[val] = True
                    args[key] = md
                else:
                    args[key] = _coerce(val)
                i += 1
                continue

            key = t

            # Special: consume metadata k=v pairs until the next --flag
            if key == "--metadata":
                i += 1
                md = {}
                while i < len(toks) and not toks[i].startswith("--"):
                    pair = toks[i]
                    if "=" in pair:
                        k, v = pair.split("=", 1)
                        md[k] = _coerce(v)
                    else:
                        md[pair] = True
                    i += 1
                args[key] = md
                continue

            # Standard: check if the next token is a value (not a flag)
            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
                args[key] = _coerce(toks[i + 1])
                i += 2
            else:
                # lone flag -> True
                args[key] = True
                i += 1
        else:
            # unexpected positional; skip
            i += 1

    return {"executable": executable, "script": script, "args": args}
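

# Illustrative parse (the command line below is hypothetical):
#   parse_client_command(
#       "python3 bench_serving.py --model foo --metadata tp=2 --trust-remote-code"
#   )
# returns:
#   {"executable": "python3", "script": "bench_serving.py",
#    "args": {"--model": "foo", "--metadata": {"tp": 2},
#             "--trust-remote-code": True}}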


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="results",
        help="Folder name for benchmark output results.",
    )
    args = parser.parse_args()
    results_folder = Path(args.result)
    if not results_folder.exists():
        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
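
    # Each <test>.json result is expected to have a sibling <test>.commands
    # file: a JSON object with "server_command" and "client_command" keys.
    # Illustrative layout (file names are hypothetical):
    #   results/serving_llama8B_tp1_qps_4.json
    #   results/serving_llama8B_tp1_qps_4.commands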

    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `vllm bench serve` command

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            # parse server command args
            out: dict[str, Any] = {
                "server_command": parse_client_command(command["server_command"])
            }
            parse_args = [
                "--tensor-parallel-size",
                "--pipeline-parallel-size",
                "--dtype",
            ]
            col_mapping = ["tp_size", "pp_size", "dtype"]
            for index, arg in enumerate(parse_args):
                if arg in out["server_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["server_command"]["args"][arg]}
                    )

            # parse client command args
            out: dict[str, Any] = {
                "client_command": parse_client_command(command["client_command"])
            }
            parse_args = [
                "--dataset-name",
                "--random-input-len",
                "--random-output-len",
                "--request-rate",
            ]
            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]

            for index, arg in enumerate(parse_args):
                if arg in out["client_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["client_command"]["args"][arg]}
                    )

            # add the raw server/client command strings
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # append this result to the serving results
            serving_results.append(raw_result)
            continue

        elif "latency" in str(test_file):
            # this result is generated via `vllm bench latency` command

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # append this result to the latency results
            latency_results.append(raw_result)
            continue

        elif "throughput" in str(test_file):
            # this result is generated via `vllm bench throughput` command

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # append this result to the throughput results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    # collect basic platform information for the report
    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    # optionally enrich the platform info when numa / cpuinfo are installed
    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    # orient="index" turns each key into a row of a one-column table
    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    # remap the column names for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
    if not serving_results.empty:
        # keep only the mapped columns that are actually present; optional
        # fields (e.g. tp_size) appear only when the command line set them
        valid_columns = [
            col for col in serving_column_mapping if col in serving_results.columns
        ]
        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue

        # sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)

        # the GPU column sometimes comes as "GPUTYPE\nGPUTYPE\n..." (one line
        # per device); collapse it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
        )
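        # e.g. a cell of "H100\nH100\nH100\nH100" becomes "4xH100"
        # (device name above is illustrative)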

    # render markdown tables (tablefmt="pipe" produces GitHub-flavored tables)
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )

    # document the result
    md_file = "benchmark_results.md"
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / json_file, "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
        f.write(json.dumps(results))
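
    # Example invocation (illustrative; the script name depends on where this
    # file lives in the repo):
    #   python3 convert_results_json_to_markdown.py --result results
    # which writes results/benchmark_results.md and results/benchmark_results.json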