Compare commits

..

1 Commits

Author SHA1 Message Date
1db4b78a13 Mock gguf in doc build 2025-07-10 13:39:35 -07:00
2301 changed files with 156646 additions and 243619 deletions

View File

@ -5,11 +5,11 @@ import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
def print_top_10_largest_files(zip_file):

View File

@ -8,8 +8,7 @@ template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""
@ -22,25 +21,7 @@ filename = os.path.basename(args.wheel)
with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace(
"manylinux1", "manylinux2014"
)
elif "aarch64" in filename:
x86_wheel = filename.replace("aarch64", "x86_64").replace(
"manylinux2014", "manylinux1"
)
arm_wheel = filename
else:
raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character
f.write(
template.format(
x86_wheel=x86_wheel,
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
arm_wheel=arm_wheel,
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
)
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
)

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
# pip install lm-eval==0.4.4
usage() {
echo``

View File

@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
# pip install lm-eval==0.4.4
usage() {
echo``

View File

@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@ -28,7 +28,6 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
@ -39,7 +38,6 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
@ -48,14 +46,12 @@ Runtime environment variables:
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
>
### Latency test
Here is an example of one test inside `latency-tests.json`:
@ -78,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@ -86,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
@ -104,6 +100,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -121,8 +118,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
@ -138,20 +135,27 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
Here is an example using the script to compare result_a and result_b without detail test name.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|----------------------------------------|----------------------------------------|----------|
| 0 | 142.633982 | 156.526018 | 1.097396 |
| 1 | 241.620334 | 294.018783 | 1.216863 |
| 2 | 218.298905 | 262.664916 | 1.203235 |
| 3 | 242.743860 | 299.816190 | 1.235113 |
Here is an example using the script to compare result_a and result_b with detail test name.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
## Nightly test details
@ -160,9 +164,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
### Nightly tests
@ -172,6 +176,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -1,4 +1,3 @@
# Nightly benchmark annotation
## Description
@ -14,15 +13,15 @@ Please download the visualization scripts in the post
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@ -8,30 +8,30 @@ This benchmark aims to:
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- 8x Nvidia A100 GPUs
- Workload:
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues

View File

@ -1,4 +1,3 @@
# Performance benchmarks descriptions
## Latency tests

View File

@ -1,202 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
from importlib import util
import pandas as pd
plotly_found = util.find_spec("plotly.express") is not None
def compare_data_columns(
files, name_column, data_column, info_cols, drop_column, debug=False
files, name_column, data_column, drop_column, ignore_test_name=False
):
"""
Align concatenation by keys derived from info_cols instead of row order.
- Pick one canonical key list: subset of info_cols present in ALL files.
- For each file: set index to those keys, aggregate duplicates
- (mean for metric, first for names).
- Concat along axis=1 (indexes align), then reset_index so callers can
- group by columns.
- If --debug, add a <file_label>_name column per file.
"""
print("\ncompare_data_column:", data_column)
print("\ncompare_data_column: " + data_column)
frames = []
raw_data_cols = []
compare_frames = []
# 1) choose a canonical key list from info_cols that exists in ALL files
cols_per_file = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
if not key_cols:
# soft fallback: use any info_cols present in the first file
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
if not key_cols:
raise ValueError(
"No common key columns found from info_cols across the input files."
)
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
# Keep rows that actually have the compared metric (same as original behavior)
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
# Stabilize numeric key columns (harmless if missing)
for c in (
"Input Len",
"Output Len",
"TP Size",
"PP Size",
"# of max concurrency.",
"qps",
):
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
# Ensure all key columns exist
for c in key_cols:
if c not in df.columns:
df[c] = pd.NA
# Set index = key_cols and aggregate duplicates → unique MultiIndex
df_idx = df.set_index(key_cols, drop=False)
# meta (key columns), unique per key
meta = df_idx[key_cols]
if not meta.index.is_unique:
meta = meta.groupby(level=key_cols, dropna=False).first()
# metric series for this file, aggregated to one row per key
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label # column label like original
# add meta once (from first file) so keys are the leftmost columns
if not meta_added:
frames.append(meta)
meta_added = True
# (NEW) debug: aligned test-name column per file
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
frames.append(s)
raw_data_cols.append(file_label)
compare_frames.append(s)
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
data_df = pd.read_json(file)
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
if ignore_test_name is False:
serving_df = serving_df.rename(columns={name_column: file + "_name"})
frames.append(serving_df[file + "_name"])
serving_df = serving_df.rename(columns={data_column: file})
frames.append(serving_df[file])
compare_frames.append(serving_df[file])
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
ratio = current / base
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
# Compare numbers among two files
ratio_df = compare_frames[1] / compare_frames[0]
frames.append(ratio_df)
compare_frames.pop(1)
# 4) concat on columns with aligned MultiIndex;
# then reset_index to return keys as columns
concat_df = pd.concat(frames, axis=1)
concat_df = concat_df.reset_index(drop=True).reset_index()
if "index" in concat_df.columns:
concat_df = concat_df.drop(columns=["index"])
# Ensure key/info columns appear first (in your info_cols order)
front = [c for c in info_cols if c in concat_df.columns]
rest = [c for c in concat_df.columns if c not in front]
concat_df = concat_df[front + rest]
print(raw_data_cols)
return concat_df, raw_data_cols
def split_json_by_tp_pp(
input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
"""
Split a benchmark JSON into separate folders by (TP Size, PP Size).
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
Returns: list of file paths written.
"""
# Load JSON data into DataFrame
with open(input_file, encoding="utf-8") as f:
data = json.load(f)
# If the JSON is a dict with a list under common keys, use that list
if isinstance(data, dict):
for key in ("results", "serving_results", "benchmarks", "data"):
if isinstance(data.get(key), list):
data = data[key]
break
df = pd.DataFrame(data)
# Keep only "serving" tests
name_col = next(
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
)
if name_col:
df = df[
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
].copy()
# Handle alias column names
rename_map = {
"tp_size": "TP Size",
"tensor_parallel_size": "TP Size",
"pp_size": "PP Size",
"pipeline_parallel_size": "PP Size",
}
df.rename(
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
)
# Ensure TP/PP columns exist (default to 1 if missing)
if "TP Size" not in df.columns:
df["TP Size"] = 1
if "PP Size" not in df.columns:
df["PP Size"] = 1
# make sure TP/PP are numeric ints with no NaN
df["TP Size"] = (
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
)
df["PP Size"] = (
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
)
# Split into separate folders
saved_paths: list[str] = []
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
os.makedirs(folder_name, exist_ok=True)
filepath = os.path.join(folder_name, "benchmark_results.json")
group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
print(f"Saved: {filepath}")
saved_paths.append(filepath)
return saved_paths
return concat_df
if __name__ == "__main__":
@ -205,103 +36,31 @@ if __name__ == "__main__":
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
"--debug", action="store_true", help="show all information for debugging"
)
parser.add_argument(
"--plot",
action=argparse.BooleanOptionalAction,
default=True,
help="plot perf diagrams or not --no-plot --plot",
)
parser.add_argument(
"-x",
"--xaxis",
type=str,
default="# of max concurrency.",
help="column name to use as X Axis in comparison graph",
"--ignore_test_name", action="store_true", help="ignore_test_name or not"
)
args = parser.parse_args()
files = args.file
print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
info_cols = [
"Model",
"Dataset Name",
"Input Len",
"Output Len",
"TP Size",
"PP Size",
"# of max concurrency.",
"qps",
]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
else:
files = args.file
print("comparing : " + ", ".join(files))
debug = args.debug
plot = args.plot
# For Plot feature, assign y axis from one of info_cols
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
ignore_test_name = args.ignore_test_name
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
output_df, raw_data_cols = compare_data_columns(
output_df = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
info_cols,
drop_column,
debug=debug,
ignore_test_name=ignore_test_name,
)
# For Plot feature, insert y axis from one of info_cols
raw_data_cols.insert(0, info_cols[y_axis_index])
filtered_info_cols = info_cols[:-2]
existing_group_cols = [
c for c in filtered_info_cols if c in output_df.columns
]
if not existing_group_cols:
raise ValueError(
f"No valid group-by columns "
f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups:
html = group.to_html()
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
if plot and plotly_found:
import plotly.express as px
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# Export to HTML
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
print(output_df)
html = output_df.to_html()
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)

View File

@ -1,19 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any
import pandas as pd
import psutil
import regex as re
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@ -44,22 +42,13 @@ throughput_results_column_mapping = {
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"model_id": "Model",
"dataset_name": "Dataset Name",
"input_len": "Input Len",
"output_len": "Output Len",
"tp_size": "TP Size",
"pp_size": "PP Size",
"dtype": "dtype",
"gpu_type": "GPU",
"completed": "# of req.",
"qps": "qps",
"max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
# "total_input_tokens": "Total input tokens",
# "total_output_tokens": "Total output tokens",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
@ -104,111 +93,15 @@ def get_size_with_unit(bytes, suffix="B"):
bytes /= factor
def _coerce(val: str) -> Any:
"""Best-effort type coercion from string to Python types."""
low = val.lower()
if low == "null":
return None
if low == "true":
return True
if low == "false":
return False
# integers
if re.fullmatch(r"[+-]?\d+", val):
try:
return int(val)
except ValueError:
pass
# floats (keep 'inf'/'-inf'/'nan' as strings)
if re.fullmatch(r"[+-]?\d*\.\d+", val):
try:
return float(val)
except ValueError:
pass
return val
def parse_client_command(cmd: str) -> dict[str, Any]:
"""Parse the client_command shell string into {executable, script, args}."""
toks = shlex.split(cmd)
if len(toks) < 2:
raise ValueError("client_command must include an executable and a script")
executable, script = toks[0], toks[1]
args: dict[str, Any] = {}
i = 2
while i < len(toks):
t = toks[i]
if t.startswith("--"):
# --key=value or --key (value) or boolean flag
if "=" in t:
key, val = t.split("=", 1)
if key == "--metadata":
md = {}
if val:
if "=" in val:
k, v = val.split("=", 1)
md[k] = _coerce(v)
else:
md[val] = True
args[key] = md
else:
args[key] = _coerce(val)
i += 1
continue
key = t
# Special: consume metadata k=v pairs until next --flag
if key == "--metadata":
i += 1
md = {}
while i < len(toks) and not toks[i].startswith("--"):
pair = toks[i]
if "=" in pair:
k, v = pair.split("=", 1)
md[k] = _coerce(v)
else:
md[pair] = True
i += 1
args[key] = md
continue
# Standard: check if next token is a value (not a flag)
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
args[key] = _coerce(toks[i + 1])
i += 2
else:
# lone flag -> True
args[key] = True
i += 1
else:
# unexpected positional; skip
i += 1
return {"executable": executable, "script": script, "args": args}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-r",
"--result",
type=str,
default="results",
help="Folder name for benchmark output results.",
)
args = parser.parse_args()
results_folder = Path(args.result)
if not results_folder.exists():
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `vllm bench serve` command
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@ -216,50 +109,18 @@ if __name__ == "__main__":
except OSError as e:
print(e)
continue
# Parse Server Command Arg
out: dict[str, Any] = {
"server_command": parse_client_command(command["server_command"])
}
parse_args = [
"--tensor-parallel-size",
"--pipeline-parallel-size",
"--dtype",
]
col_mapping = ["tp_size", "pp_size", "dtype"]
for index, arg in enumerate(parse_args):
if arg in out["server_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["server_command"]["args"][arg]}
)
# Parse Client Command Arg
out: dict[str, Any] = {
"client_command": parse_client_command(command["client_command"])
}
parse_args = [
"--dataset-name",
"--random-input-len",
"--random-output-len",
"--request-rate",
]
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
for index, arg in enumerate(parse_args):
if arg in out["client_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["client_command"]["args"][arg]}
)
# Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `vllm bench latency` command
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
try:
@ -287,7 +148,7 @@ if __name__ == "__main__":
continue
elif "throughput" in f.name:
# this result is generated via `vllm bench throughput` command
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
try:
@ -343,10 +204,7 @@ if __name__ == "__main__":
columns=latency_column_mapping
)
if not serving_results.empty:
valid_columns = [
col for col in serving_column_mapping if col in serving_results.columns
]
serving_results = serving_results[valid_columns].rename(
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
@ -386,9 +244,7 @@ if __name__ == "__main__":
)
# document the result
md_file = "benchmark_results.md"
json_file = "benchmark_results.json"
with open(results_folder / md_file, "w") as f:
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
@ -403,7 +259,7 @@ if __name__ == "__main__":
f.write(results)
# document benchmarking results in json
with open(results_folder / json_file, "w") as f:
with open(results_folder / "benchmark_results.json", "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")

View File

@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
@ -95,14 +95,12 @@ json2args() {
}
kill_gpu_processes() {
pkill -f '[p]ython'
pkill -f '[p]ython3'
pkill -f '[t]ritonserver'
pkill -f '[p]t_main_thread'
pkill -f '[t]ext-generation'
pkill -f '[l]mdeploy'
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@ -127,7 +125,7 @@ ensure_installed() {
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
@ -227,7 +225,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="vllm bench serve \
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@ -248,7 +246,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="vllm bench serve \
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@ -267,13 +265,13 @@ run_serving_tests() {
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@ -304,7 +302,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
# run genai-perf tests
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@ -313,14 +311,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@ -371,10 +369,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@ -382,7 +380,7 @@ run_genai_perf_tests() {
client_command="genai-perf profile \
-m $model \
--service-kind openai \
--backend "$backend" \
--backend vllm \
--endpoint-type chat \
--streaming \
--url localhost:$port \
@ -415,7 +413,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {

View File

@ -33,7 +33,7 @@ check_gpus() {
check_cpus() {
# check the number of CPUs and NUMA Node and GPU type.
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
@ -126,8 +126,7 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
@ -165,7 +164,7 @@ upload_to_buildkite() {
}
run_latency_tests() {
# run latency tests using `vllm bench latency` command
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
@ -194,11 +193,9 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -208,7 +205,7 @@ run_latency_tests() {
fi
fi
latency_command=" $latency_envs vllm bench latency \
latency_command=" $latency_envs python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@ -234,7 +231,7 @@ run_latency_tests() {
}
run_throughput_tests() {
# run throughput tests using `vllm bench throughput`
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
@ -263,11 +260,9 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -277,7 +272,7 @@ run_throughput_tests() {
fi
fi
throughput_command=" $throughput_envs vllm bench throughput \
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@ -302,7 +297,7 @@ run_throughput_tests() {
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
@ -333,21 +328,12 @@ run_serving_tests() {
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -403,39 +389,35 @@ run_serving_tests() {
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
new_test_name=$test_name"_qps_"$qps
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
bash -c "$client_command"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
# clean up
@ -465,7 +447,7 @@ main() {
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by `vllm bench serve` command
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn of the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING"

View File

@ -11,7 +11,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@ -20,7 +20,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,

View File

@ -35,7 +35,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -88,7 +90,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -141,7 +145,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -191,7 +197,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -243,7 +251,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -295,7 +305,9 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@ -1,610 +0,0 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -1,820 +0,0 @@
[
{
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -2,112 +2,104 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@ -115,34 +107,32 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@ -150,18 +140,18 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
}

View File

@ -7,6 +7,7 @@
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -25,6 +26,7 @@
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -43,6 +45,7 @@
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -57,7 +60,8 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {

View File

@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -21,7 +21,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",

View File

@ -1,22 +1,5 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@ -29,7 +12,6 @@ steps:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@ -41,61 +23,44 @@ steps:
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
# However, this block can be uncommented to save some compute hours.
# - block: "Build CUDA 11.8 wheel"
# key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build release image (x86)"
- block: "Build release image"
depends_on: ~
id: build-release-image-x86
key: block-release-image-build
- label: "Build release image"
depends_on: block-release-image-build
id: build-release-image
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
- label: "Create multi-arch manifest"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@ -142,24 +107,18 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
- block: "Build Neuron release image"
key: block-neuron-release-image-build
depends_on: ~
- label: "Build and publish Neuron release image"
depends_on: block-neuron-release-image-build
agents:
queue: cpu_queue_postmerge
queue: neuron-postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -1,97 +0,0 @@
#!/bin/bash
set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then
break
fi
all_tags="$all_tags$tags"$'\n'
page=$((page + 1))
done
# Sort by timestamp (newest first) and extract just the tag names
echo "$all_tags" | sort -r | cut -d'|' -f2
}
delete_tag() {
local tag_name="$1"
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
else
echo "Successfully deleted tag: $tag_name"
fi
}
# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)
if [ -z "$all_tags" ]; then
echo "No tags found to clean up"
exit 0
fi
# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"
# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))
if [ $tags_to_delete -le 0 ]; then
echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
exit 0
fi
echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
if [ -z "$tags_to_delete_list" ]; then
echo "No tags to delete"
exit 0
fi
# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
if [ -n "$tag" ]; then
delete_tag "$tag"
# Add a small delay to avoid rate limiting
sleep 1
fi
done <<< "$tags_to_delete_list"
echo "Cleanup completed successfully"

View File

@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@ -104,6 +108,7 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
@ -117,6 +122,7 @@ fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
@ -160,9 +166,16 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py

View File

@ -6,16 +6,15 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
@ -25,8 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@ -46,74 +45,58 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
# Note: disable Bart until supports V1
pytest -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# VLLM_USE_V1=0 pytest -x -s -v \
# VLLM_USE_V1=0 pytest -s -v \
# tests/quantization/test_ipex_quant.py"
# online serving
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
pytest -s -v \
tests/lora/test_qwen2vl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@ -16,7 +16,8 @@ DOCKER_BUILDKIT=1 docker build . \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX"
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }

View File

@ -6,17 +6,19 @@ set -exuo pipefail
# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
FROM 1.22-413-pt2.7.1:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements/hpu.txt
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

View File

@ -0,0 +1,64 @@
#!/bin/bash
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
set -v
image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"

View File

@ -1,167 +0,0 @@
#!/bin/bash
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -5,6 +5,7 @@ set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
@ -61,8 +62,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
@ -70,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
@ -149,6 +149,18 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -23,27 +23,12 @@ docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
-e "HF_TOKEN=${HF_TOKEN}" \
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
sh -c '
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py
'

View File

@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
vllm bench serve \
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -1,59 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=24h" --all
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."

View File

@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=tpu-test
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct

View File

@ -12,6 +12,8 @@ source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}

View File

@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=tpu-test
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8

View File

@ -44,6 +44,7 @@ echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
@ -76,7 +77,7 @@ done
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
vllm bench serve \
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \

View File

@ -14,19 +14,8 @@ fi
# Get the single wheel file
wheel="${wheel_files[0]}"
# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename
new_wheel="${wheel/linux/$manylinux_version}"
# Rename 'linux' to 'manylinux1' in the wheel filename
new_wheel="${wheel/linux/manylinux1}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu126"* ]]; then
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
# only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@ -75,15 +63,14 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu126"* ]]; then
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
# only upload index.html for cu128 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

File diff suppressed because it is too large Load Diff

View File

@ -1,32 +0,0 @@
[run]
source = vllm
omit =
*/tests/*
*/test_*
*/__pycache__/*
*/build/*
*/dist/*
*/vllm.egg-info/*
*/third_party/*
*/examples/*
*/benchmarks/*
*/docs/*
[report]
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
raise AssertionError
raise NotImplementedError
if 0:
if __name__ == .__main__.:
class .*\bProtocol\):
@(abc\.)?abstractmethod
[html]
directory = htmlcov
[xml]
output = coverage.xml

View File

@ -1,6 +0,0 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

View File

@ -1,24 +0,0 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attetion/*.py"
# - "vllm/v1/core/*.py"
exclude:
- "**/*.py"
scan:
functions: true # check free functions and methods
classes: true # check classes/dataclasses
public_only: true # ignore names starting with "_" at any level
annotations:
include: # decorators that forceinclude a symbol
- name: "bc_linter_include" # matched by simple name or dotted suffix
propagate_to_members: false # for classes, include methods/inner classes
exclude: # decorators that forceexclude a symbol
- name: "bc_linter_skip" # matched by simple name or dotted suffix
propagate_to_members: true # for classes, exclude methods/inner classes
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]

120
.github/CODEOWNERS vendored
View File

@ -2,121 +2,53 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
/vllm/v1/structured_output @mgoin @russellb @aarnphm
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/multi_step @alexm-redhat @comaniac
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/tests/models/test_transformers.py @hmellor
# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
/docs @hmellor
mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor
# CPU
/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
# Qwen-specific files
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
# MTP-specific files
/vllm/model_executor/models/deepseek_mtp.py @luccafong
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras
/vllm/v1/attention/backends/rocm*.py @gshtras
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
# TPU
/vllm/v1/worker/tpu* @NickLucche
/vllm/platforms/tpu.py @NickLucche
/vllm/v1/sample/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche
# KVConnector installation files
/requirements/kv_connectors.txt @NickLucche

View File

@ -43,6 +43,10 @@ body:
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:

View File

@ -1,5 +1,10 @@
<!-- markdownlint-disable -->
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
## Purpose
@ -7,15 +12,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
## Test Result
---
<details>
<summary> Essential Elements of an Effective PR Description Checklist </summary>
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
</details>
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

63
.github/mergify.yml vendored
View File

@ -118,27 +118,6 @@ pull_request_rules:
add:
- qwen
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
- files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- files~=^vllm/entrypoints/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
- title~=(?i)harmony
actions:
label:
add:
- gpt-oss
- name: label-rocm
description: Automatically apply rocm label
conditions:
@ -170,8 +149,11 @@ pull_request_rules:
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@ -182,7 +164,10 @@ pull_request_rules:
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files~=^vllm/v1/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
- files~=^tests/v1/spec_decode/
- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
- files~=^vllm/model_executor/models/.*eagle.*\.py
@ -239,6 +224,7 @@ pull_request_rules:
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@ -273,26 +259,12 @@ pull_request_rules:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
- name: assign reviewer for modelopt changes
conditions:
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
- files~=^tests/models/quantization/test_modelopt\.py$
- files~=^tests/quantization/test_modelopt\.py$
- files~=^tests/models/quantization/test_nvfp4\.py$
- files~=^docs/features/quantization/modelopt\.md$
actions:
assign:
users:
- "Edwardf0t1"
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
@ -301,20 +273,3 @@ pull_request_rules:
label:
remove:
- needs-rebase
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
- files~=^examples/others/lmcache/
- files~=^tests/v1/kv_connector/
- files~=^vllm/distributed/kv_transfer/
- title~=(?i)\bP/?D\b
- title~=(?i)NIXL
- title~=(?i)LMCache
actions:
label:
add:
- kv-connector

View File

@ -1,21 +0,0 @@
# scale-config.yml:
# Powers what instance types are available for GHA auto-scaled
# runners. Runners listed here will be available as self hosted
# runners, configuration is directly pulled from the main branch.
# runner_types:
# runner_label:
# instance_type: m4.large
# os: linux
# # min_available defaults to the global cfg in the ALI Terraform
# min_available: undefined
# # when max_available value is not defined, no max runners is enforced
# max_available: undefined
# disk_size: 50
# is_ephemeral: true
runner_types:
linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: true
os: linux

View File

@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
sed -i '/<!--.*-->$/d' "${NEW}"
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

View File

@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
github.rest.issues.addLabels({

View File

@ -1,29 +0,0 @@
name: BC Lint
on:
pull_request:
types:
- opened
- synchronize
- reopened
- labeled
- unlabeled
jobs:
bc_lint:
if: github.repository_owner == 'vllm-project'
runs-on: ubuntu-latest
steps:
- name: Run BC Lint Action
uses: pytorch/test-infra/.github/actions/bc-lint@main
with:
repo: ${{ github.event.pull_request.head.repo.full_name }}
base_sha: ${{ github.event.pull_request.base.sha }}
head_sha: ${{ github.event.pull_request.head.sha }}
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir: .github
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true

View File

@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: '3.12'

View File

@ -1,309 +0,0 @@
name: Label issues based on keywords
on:
issues:
types: [opened, edited, reopened]
permissions:
issues: write # needed so the workflow can add labels
contents: read
concurrency:
group: issue-labeler-${{ github.event.issue.number }}
cancel-in-progress: true
jobs:
add-labels:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Add new labels and keywords here
const labelConfig = {
rocm: {
// Keyword search - matches whole words only (with word boundaries)
keywords: [
{
term: "composable kernel",
searchIn: "both"
},
{
term: "rccl",
searchIn: "body" // only search in body
},
{
term: "migraphx",
searchIn: "title" // only search in title
},
{
term: "hipgraph",
searchIn: "both"
},
{
term: "ROCm System Management Interface",
searchIn: "body"
},
],
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
term: "VLLM_ROCM_",
searchIn: "both"
},
{
term: "aiter",
searchIn: "title"
},
{
term: "rocm",
searchIn: "title"
},
{
term: "amd",
searchIn: "title"
},
{
term: "hip-",
searchIn: "both"
},
{
term: "gfx",
searchIn: "both"
},
{
term: "cdna",
searchIn: "both"
},
{
term: "rdna",
searchIn: "both"
},
{
term: "torch_hip",
searchIn: "body" // only in body
},
{
term: "_hip",
searchIn: "both"
},
{
term: "hip_",
searchIn: "both"
},
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
// Regex patterns - for complex pattern matching
regexPatterns: [
{
pattern: "\\bmi\\d{3}[a-z]*\\b",
description: "AMD GPU names (mi + 3 digits + optional letters)",
flags: "gi",
searchIn: "both" // "title", "body", or "both"
}
],
},
};
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
return new RegExp(`\\b${escapedTerm}\\b`, "gi");
case 'substring':
// Substring search - matches anywhere in the text
return new RegExp(escapedTerm, "gi");
default:
throw new Error(`Unknown search type: ${type}`);
}
}
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
searchIn = 'both'; // default
} else {
term = termConfig.term;
searchIn = termConfig.searchIn || 'both';
pattern = termConfig.pattern;
description = termConfig.description;
flags = termConfig.flags;
}
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
const termMatches = [];
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
if (lineMatches) {
lineMatches.forEach(match => {
termMatches.push({
match: match,
lineNumber: lineIndex + 1,
lineContent: line.trim(),
searchType: searchType,
searchLocation: searchLocation,
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
searchType: searchType,
searchLocation: searchLocation,
searchIn: searchIn,
pattern: pattern,
matches: termMatches,
count: termMatches.length
});
}
}
return matches;
}
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
if (match.description) {
core.notice(` Description: ${match.description}`);
}
core.notice(` Context: ${match.context}`);
if (match.lineContent !== match.context) {
core.notice(` Full line: ${match.lineContent}`);
}
});
}
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: [labelName],
});
core.notice(`Label "${labelName}" added. ${reason}`);
return true;
}
core.notice(`Label "${labelName}" already present.`);
return false;
}
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
// Process all configured labels
const processLabels = Object.entries(labelConfig)
.map(([labelName, config]) => processLabel(labelName, config));
const labelsAdded = await Promise.all(processLabels);
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);

85
.github/workflows/lint-and-deploy.yaml vendored Normal file
View File

@ -0,0 +1,85 @@
name: Lint and Deploy Charts
on: pull_request
permissions:
contents: read
jobs:
lint-and-deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
with:
version: v3.14.4
#Python is required because ct lint runs Yamale and yamllint which require Python.
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: '3.13'
- name: Set up chart-testing
uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
with:
version: v3.10.1
- name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
- name: Setup minio
run: |
docker network create vllm-net
docker run -d -p 9000:9000 --name minio --net vllm-net \
-e "MINIO_ACCESS_KEY=minioadmin" \
-e "MINIO_SECRET_KEY=minioadmin" \
-v /tmp/data:/data \
-v /tmp/config:/root/.minio \
minio/minio server /data
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_EC2_METADATA_DISABLED=true
mkdir opt-125m
cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
- name: Create kind cluster
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |
docker pull amazon/aws-cli:2.6.4
kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
kind load docker-image vllm-cpu-env:latest --name chart-testing
docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
kubectl create ns ns-vllm
- name: Run chart-testing (install)
run: |
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
sleep 10
CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
--header "Content-Type: application/json" \
--data '{
"model": "opt-125m",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'):$CODE"
echo "$CODE"

View File

@ -1,17 +0,0 @@
{
"problemMatcher": [
{
"owner": "markdownlint",
"pattern": [
{
"regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
"file": 1,
"line": 2,
"column": 3,
"code": 4,
"message": 5
}
]
}
]
}

View File

@ -5,10 +5,6 @@ on:
push:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
permissions:
contents: read
@ -17,11 +13,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:

111
.github/workflows/publish.yml vendored Normal file
View File

@ -0,0 +1,111 @@
# This workflow will upload a Python Package to Release asset
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
name: Create Release
on:
push:
tags:
- v*
# Needed to create release and upload assets
permissions:
contents: write
jobs:
release:
# Retrieve tag and create release
name: Create Release
runs-on: ubuntu-latest
outputs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Extract branch info
shell: bash
run: |
echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
- name: Create Release
id: create_release
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
script: |
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
# wheel:
# name: Build Wheel
# runs-on: ${{ matrix.os }}
# needs: release
# strategy:
# fail-fast: false
# matrix:
# os: ['ubuntu-20.04']
# python-version: ['3.9', '3.10', '3.11', '3.12']
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
# cuda-version: ['11.8', '12.1']
# steps:
# - name: Checkout
# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# - name: Setup ccache
# uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
# with:
# create-symlink: true
# key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
# - name: Set up Linux Env
# if: ${{ runner.os == 'Linux' }}
# run: |
# bash -x .github/workflows/scripts/env.sh
# - name: Set up Python
# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
# - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
# - name: Build wheel
# shell: bash
# env:
# CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
# run: |
# bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
# wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
# asset_name=${wheel_name//"linux"/"manylinux1"}
# echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
# - name: Upload Release Asset
# uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ needs.release.outputs.upload_url }}
# asset_path: ./dist/${{ env.wheel_name }}
# asset_name: ${{ env.asset_name }}
# asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
# uses: pypa/gh-action-pypi-publish@release/v1.8
# with:
# repository-url: https://test.pypi.org/legacy/
# password: ${{ secrets.PYPI_API_TOKEN }}
# skip-existing: true

View File

@ -9,46 +9,19 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
try {
// Get the PR author
const prAuthor = context.payload.pull_request.user.login;
// Check if this is the author's first PR in this repository
// Use GitHub's search API to find all PRs by this author
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
per_page: 100
});
const authorPRCount = searchResults.total_count;
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
// Only post comment if this is the first PR (only one PR by this author)
if (authorPRCount === 1) {
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
'🚀'
});
} else {
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
}
} catch (error) {
console.error('Error checking PR history or posting comment:', error);
// Don't fail the workflow, just log the error
}
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh

View File

@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

17
.gitignore vendored
View File

@ -4,9 +4,6 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
# triton jit
.triton
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -150,8 +147,7 @@ venv.bak/
# mkdocs documentation
/site
docs/argparse
docs/examples/*
!docs/examples/README.md
docs/examples
# mypy
.mypy_cache/
@ -177,14 +173,6 @@ cython_debug/
# VSCode
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# DS Store
.DS_Store
@ -215,6 +203,3 @@ shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
ep_kernels_workspace/

View File

@ -1,13 +0,0 @@
MD007:
indent: 4
MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false
MD052: false
MD053: false
MD059: false

View File

@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.35.5
rev: v1.32.0
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
@ -35,12 +35,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: markdownlint
- id: pymarkdown
exclude: '.*\.inc\.md'
stages: [manual] # Only run in CI
args: [fix]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
@ -49,7 +49,7 @@ repos:
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@ -60,32 +60,38 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local"
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: python tools/pre_commit/mypy.py 1 "3.11"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@ -149,15 +155,18 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
entry: python tools/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
pass_filenames: false
additional_dependencies: [pathspec, regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
additional_dependencies: [regex]
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -7,13 +7,9 @@ build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
post_checkout:
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:

View File

@ -1,2 +1 @@
collect_env.py
vllm/model_executor/layers/fla/ops/*.py

View File

@ -13,10 +13,6 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@ -34,10 +30,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -49,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
#
# Try to find python package with an executable that exactly matches
@ -86,9 +82,6 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
@ -179,24 +172,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
endif()
#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()
#
# Set CUDA include flags for CXX compiler.
# Set nvcc fatbin compression.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
endif()
endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -275,6 +259,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
@ -282,7 +267,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -312,14 +297,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/quantization/fp8/per_token_group_quant.cu")
"csrc/attention/mla/cutlass_mla_entry.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -373,27 +360,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
set(MARLIN_SRCS
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
@ -451,16 +431,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -485,11 +460,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -567,40 +538,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
# clear FP4_ARCHS
set(FP4_ARCHS)
endif()
# FP4 Archs and flags
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@ -608,7 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
@ -618,14 +560,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# CUTLASS MLA Archs and flags
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
@ -648,7 +586,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -666,36 +604,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
@ -713,12 +623,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures.")
endif()
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
@ -807,44 +713,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"found in CUDA target architectures")
endif()
endif()
# Only build W4A8 kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${W4A8_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
AND W4A8_ARCHS)
message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running w4a16 quantized models on "
"Hopper.")
else()
message(STATUS "Not building W4A8 kernels as no compatible archs "
"found in CUDA target architectures")
endif()
endif()
# Hadacore kernels
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${HADACORE_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building hadacore")
endif()
# if CUDA endif
endif()
@ -885,17 +753,7 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
endif()
set_gencode_flags_for_srcs(
@ -956,10 +814,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
@ -970,6 +824,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_PERMUTE_SRC}"
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C

View File

@ -2,6 +2,7 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt

View File

@ -1,4 +1,3 @@
<!-- markdownlint-disable MD001 MD041 -->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
@ -14,27 +13,17 @@ Easy, fast, and cheap LLM serving for everyone
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
---
*Latest News* 🔥
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
<details>
<summary>Previous News</summary>
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
@ -57,7 +46,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
</details>
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -75,19 +63,20 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill
**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
- Prefix caching support
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
@ -104,7 +93,6 @@ pip install vllm
```
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
@ -121,7 +109,6 @@ vLLM is a community project. Our compute resources for development and testing a
<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
Cash Donations:
- a16z
- Dropbox
- Sequoia Capital
@ -129,8 +116,6 @@ Cash Donations:
- ZhenFund
Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- AWS
@ -170,7 +155,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature

View File

@ -52,39 +52,3 @@ After branch cut, we approach finalizing the release branch with clear criteria
* Release branch specific changes (e.g. change version identifiers or CI fixes)
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
## Manual validations
### E2E Performance Validation
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
**Current Coverage:**
* Models: Llama3, Llama4, and Mixtral
* Hardware: NVIDIA H100 and AMD MI300x
* _Note: Coverage may change based on new model releases and hardware availability_
**Performance Validation Process:**
**Step 1: Get Access**
Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
**Step 2: Review Benchmark Setup**
Familiarize yourself with the benchmark configurations:
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
**Step 3: Run the Benchmark**
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
* **vLLM commit**: Set to the RC commit hash
**Step 4: Review Results**
Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
**Step 5: Performance Comparison**
Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).

View File

@ -1,50 +1,13 @@
# Security Policy
## Reporting security issues
## Reporting a Vulnerability
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
## Issue triage
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
## Threat model
---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
## Issue severity
We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
### CRITICAL Severity
Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥9.0.
### HIGH Severity
Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
### MODERATE Severity
Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
### LOW Severity
Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
## Prenotification policy
For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
* Substantial internal deployment leveraging the upstream vLLM project.
* Established internal security teams and comprehensive compliance measures.
* Active and consistent contributions to the upstream vLLM project.
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.

View File

@ -1,20 +1,605 @@
# Benchmarks
# Benchmarking vLLM
This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. Its a living document, updated as new features and datasets
become available.
## Contents
**Dataset Overview**
- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:10%; text-align: center;">Online</th>
<th style="width:10%; text-align: center;">Offline</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>ShareGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
</tr>
<tr>
<td><strong>Sonnet</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
</tr>
<tr>
<td><strong>Random</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>
## Usage
✅: supported
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
🟡: Partial support
For full CLI reference see:
🚧: to be supported
- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
---
<details>
<summary><b>🚀 Example - Online Benchmark</b></summary>
<br/>
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
```
If successful, you will see the following output
```
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
**Custom Dataset**
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
**VisionArena Benchmark for Vision Language Models**
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
```
**InstructCoder Benchmark with Speculative Decoding**
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
**Other HuggingFaceDataset Examples**
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
**Running With Sampling Parameters**
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
```
**Running With Ramp-Up Request Rate**
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
</details>
<details>
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
<br/>
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
```
If successful, you will see the following output
```
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
**VisionArena Benchmark for Vision Language Models**
``` bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
```
The `num prompt tokens` now includes image token counts
```
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
**InstructCoder Benchmark with Speculative Decoding**
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
**Other HuggingFaceDataset Examples**
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
**Benchmark with LoRA Adapters**
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
</details>
<details>
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
**Server Setup**
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
**JSON Schema Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
**Grammar-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
**Regex-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
**Choice-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
**XGrammar Benchmark Dataset**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
<details>
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
**Basic Long Document QA Test**
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
**Different Repeat Modes**
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
<details>
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
**Fixed Prompt with Prefix Caching**
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
**ShareGPT Dataset with Prefix Caching**
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
</details>
<details>
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
**Basic Prioritization Test**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
**Multiple Sequences per Prompt**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>

View File

@ -1,45 +1,56 @@
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# See details in README (benchmarks/auto_tune/README.md).
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
# It also supports additional requirement: e2e latency and prefix cache.
# Pre-requisite:
# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file.
# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
SYSTEM=${SYSTEM:-"TPU"}
TP=${TP:-1}
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
INPUT_LEN=${INPUT_LEN:-4000}
OUTPUT_LEN=${OUTPUT_LEN:-16}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "====================== AUTO TUNE PARAMETERS ===================="
echo "SCRIPT_DIR=$SCRIPT_DIR"
echo "BASE=$BASE"
echo "MODEL=$MODEL"
echo "SYSTEM=$SYSTEM"
echo "TP=$TP"
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
echo "INPUT_LEN=$INPUT_LEN"
echo "OUTPUT_LEN=$OUTPUT_LEN"
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
echo "result file: $RESULT"
echo "model: $MODEL"
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
@ -54,18 +65,10 @@ current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
exit 1
fi
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0
start_server() {
local gpu_memory_utilization=$1
@ -73,47 +76,26 @@ start_server() {
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
pkill -if vllm
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local common_args_array=(
"$MODEL"
"--disable-log-requests"
"--port" "8004"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
"--tensor-parallel-size" "$TP"
"--enable-prefix-caching"
"--load-format" "dummy"
"--download-dir" "$DOWNLOAD_DIR"
"--max-model-len" "$MAX_MODEL_LEN"
)
# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
@ -121,29 +103,45 @@ start_server() {
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -if vllm
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@ -151,19 +149,17 @@ run_benchmark() {
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# --profile flag is removed from this call
vllm bench serve \
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
@ -172,7 +168,8 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
--port 8004 \
--profile &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@ -186,15 +183,16 @@ run_benchmark() {
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
@ -223,7 +221,12 @@ run_benchmark() {
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
best_request_rate=$request_rate
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -232,9 +235,9 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if vllm
pkill vllm
sleep 10
echo "===================="
printf '=%.0s' $(seq 1 20)
return 0
}
@ -245,8 +248,7 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@ -269,45 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
echo "finish permutations"
# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -1,218 +0,0 @@
# Automated vLLM Server Parameter Tuning
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
- [Example Use Cases](#example-use-cases)
- [Output](#output)
- [How It Works](#how-it-works)
## Prerequisites
Before running the script, please ensure the following steps are completed:
1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch.
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
# git checkout <your-branch>
```
1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
## Configuration
You must set the following variables at the top of the script before execution.
Note: You can also override the default values below via environment variables when running the script.
```bash
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
```
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
## How to Run
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
```bash
cd <FOLDER_OF_THIS_SCRIPT>
bash auto_tune.sh
```
Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself.
## Example Use Cases
Here are a few examples of how to configure the script for different goals:
### 1. Maximize Throughput (No Latency Constraint)
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
#### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=60
MAX_LATENCY_ALLOWED_MS=500
```
## Output
After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
```text
# Example result.txt content
hash:a1b2c3d4...
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
...
best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
```
If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
## How It Works
The script follows a systematic process to find the optimal parameters:
1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
3. **Latency-Aware Throughput Search**: For each parameter combination:
- The vLLM server is started.
- A benchmark is first run with an infinite request rate (`--request-rate inf`).
- If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
- If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
## Batched `auto_tune`
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
### Prerequisites
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
### How to Run
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
2. **Execute the script**:
```bash
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
```
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
### Configuration File
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
Here is an example `runs_config.json` with two benchmark configurations:
```json
[
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 128,
"output_len": 2048,
"max_model_len": 2300,
"num_seqs_list": "128 256",
"num_batched_tokens_list": "8192 16384"
},
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-70B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 4000,
"output_len": 16,
"max_model_len": 4096,
"num_seqs_list": "64 128",
"num_batched_tokens_list": "4096 8192",
"max_latency_allowed_ms": 500
}
]
```
### Output
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
A summary of successful and failed runs is also printed to the console upon completion.

View File

@ -1,128 +0,0 @@
#!/bin/bash
INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
if [[ -z "$INPUT_JSON" ]]; then
echo "Error: Input JSON file not provided."
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
exit 1
fi
if [[ ! -f "$INPUT_JSON" ]]; then
echo "Error: File not found at '$INPUT_JSON'"
exit 1
fi
if ! command -v jq &> /dev/null; then
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
exit 1
fi
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
exit 1
fi
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)
json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
exit 1
fi
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"
for i in $(seq 0 $(($num_runs - 1))); do
run_object=$(echo "$json_content" | jq ".[$i]")
RUN_START_TIME=$(date +%s)
ENV_VARS_ARRAY=()
# Dynamically create env vars from the JSON object's keys
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
value=$(echo "$run_object" | jq -r ".$key")
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
ENV_VARS_ARRAY+=("${var_name}=${value}")
done
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
# Execute auto_tune.sh and capture output
RUN_OUTPUT_FILE=$(mktemp)
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
STATUS="SUCCESS"
((SUCCESS_COUNT++))
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
rm "$RUN_OUTPUT_FILE"
# Parse results and optionally upload them to GCS
RUN_ID=""
RESULTS=""
GCS_RESULTS_URL=""
if [[ "$STATUS" == "SUCCESS" ]]; then
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
RESULTS=$(cat "$RESULT_FILE_PATH")
if [[ -n "$GCS_PATH" ]]; then
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
echo "Uploading results to GCS..."
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
echo "GCS upload successful."
else
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
fi
fi
else
echo "Warning: Could not find result file for a successful run."
STATUS="WARNING_NO_RESULT_FILE"
fi
fi
# Add the results back into the JSON object for this run
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
RUN_END_TIME=$(date +%s)
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
echo "--------------------------------------------------"
# Save intermediate progress back to the file
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
done
SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="
if [[ $FAILURE_COUNT -gt 0 ]]; then
echo "Details of failed runs (see JSON file for full parameters):"
for failed in "${FAILED_RUNS[@]}"; do
echo " - $failed"
done
fi
echo "Updated results have been saved to '$INPUT_JSON'."

View File

@ -31,10 +31,9 @@ class RequestFuncInput:
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
request_id: Optional[str] = None
@dataclass
@ -72,9 +71,6 @@ async def async_request_tgi(
"inputs": request_func_input.prompt,
"parameters": params,
}
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
@ -86,9 +82,7 @@ async def async_request_tgi(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
@ -151,9 +145,6 @@ async def async_request_trt_llm(
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@ -161,9 +152,7 @@ async def async_request_trt_llm(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
@ -222,8 +211,6 @@ async def async_request_deepspeed_mii(
"top_p": 1.0,
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@ -296,8 +283,6 @@ async def async_request_openai_completions(
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@ -379,15 +364,7 @@ async def async_request_openai_chat_completions(
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
content.extend(mm_content)
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
content.append(request_func_input.multi_modal_content)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
@ -410,8 +387,6 @@ async def async_request_openai_chat_completions(
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@ -508,8 +483,6 @@ async def async_request_openai_audio(
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
# Send audio file
def to_bytes(y, sr):
@ -518,10 +491,7 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
mm_audio = request_func_input.multi_modal_content
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
raise TypeError("multi_modal_content must be a dict containing 'audio'")
with to_bytes(*mm_audio["audio"]) as f:
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():

View File

@ -1,74 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Enforce a GC collect ahead to minimize the impact among runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_times = TimeCollector(TimeCollector.US)
free_blocks_times = TimeCollector(TimeCollector.US)
for _ in range(args.num_iteration):
with get_blocks_times:
blocks = block_pool.get_new_blocks(allocate_block)
with free_blocks_times:
block_pool.free_blocks(blocks)
rows.append(
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+ get_blocks_times.dump_avg_max()
+ free_blocks_times.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (us)",
"Get Blocks\nMax (us)",
"Free Blocks\nAvg (us)",
"Free Blocks\nMax (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +1,186 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import os
import time
from typing import Any, Optional
import numpy as np
from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
)
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
engine_args = EngineArgs.from_cli_args(args)
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len
), (
"Please ensure that max_model_len is greater than"
" the sum of input_len and output_len."
)
sampling_params = SamplingParams(
n=args.n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]
def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,
BeamSearchParams(
beam_width=args.n,
max_tokens=args.output_len,
ignore_eos=True,
),
)
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
llm.start_profile()
llm_generate()
llm.stop_profile()
else:
start_time = time.perf_counter()
llm_generate()
end_time = time.perf_counter()
latency = end_time - start_time
return latency
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None)
if args.profile:
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f"Avg latency: {np.mean(latencies)} seconds")
for percentage, percentile in zip(percentages, percentiles):
print(f"{percentage}% percentile latency: {percentile} seconds")
# Output JSON results if specified
if args.output_json:
results = {
"avg_latency": np.mean(latencies),
"latencies": latencies.tolist(),
"percentiles": dict(zip(percentages, percentiles.tolist())),
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--n",
type=int,
default=1,
help="Number of generated sequences per prompt.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-iters-warmup",
type=int,
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument(
"--num-iters", type=int, default=30, help="Number of iterations to run."
)
parser.add_argument(
"--profile",
action="store_true",
help="profile the generation process of a single batch",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help="Path to save the latency results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
print("""DEPRECATED: This script has been moved to the vLLM CLI.
Please use the following command instead:
vllm bench latency
For help with the new command, run:
vllm bench latency --help
Alternatively, you can run the new command directly with:
python -m vllm.entrypoints.cli.main bench latency --help
""")
sys.exit(1)
parser = create_argument_parser()
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args)

View File

@ -1,213 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from unittest import mock
import numpy as np
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import (
CacheConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
def benchmark_propose(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False,
)
proposer = NgramProposer(
vllm_config=VllmConfig(
model_config=model_config,
speculative_config=SpeculativeConfig(
prompt_lookup_min=args.min_ngram,
prompt_lookup_max=max_ngram,
num_speculative_tokens=args.num_spec_token,
method="ngram",
),
)
)
# Warm up
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
gc.collect()
for _ in range(args.num_iteration):
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
with collector:
for i in range(args.num_req):
proposer.propose(tokens[i, :])
rows.append(
[args.num_req, args.num_token, args.min_ngram, max_ngram]
+ collector.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"# Request",
"# Token",
"Min Ngram",
"Max Ngram",
"Avg (us)",
"Max (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def benchmark_batched_propose(args):
NUM_SPECULATIVE_TOKENS_NGRAM = 10
PROMPT_LOOKUP_MIN = 5
PROMPT_LOOKUP_MAX = 15
MAX_MODEL_LEN = int(1e7)
DEVICE = current_platform.device_type
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
method="ngram",
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
prompt_lookup_max=PROMPT_LOOKUP_MAX,
prompt_lookup_min=PROMPT_LOOKUP_MIN,
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig(),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = 1
with mock.patch(
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
):
runner = GPUModelRunner(vllm_config, DEVICE)
# hack max model len
runner.max_model_len = MAX_MODEL_LEN
runner.drafter.max_model_len = MAX_MODEL_LEN
dummy_input_batch = InputBatch(
max_num_reqs=args.num_req,
max_model_len=MAX_MODEL_LEN,
max_num_batched_tokens=args.num_req * args.num_token,
device=DEVICE,
pin_memory=False,
vocab_size=256000,
block_sizes=[16],
)
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
dummy_input_batch.spec_decode_unsupported_reqs = ()
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
dummy_input_batch.token_ids_cpu = np.random.randint(
0, 20, (args.num_req, args.num_token)
)
runner.input_batch = dummy_input_batch
sampled_token_ids = [[0]] * args.num_req
print("Starting benchmark")
# first run is warmup so ignore it
for _ in range(args.num_iteration):
start = time.time()
runner.drafter.propose(
sampled_token_ids,
dummy_input_batch.req_ids,
dummy_input_batch.num_tokens_no_spec,
dummy_input_batch.token_ids_cpu,
dummy_input_batch.spec_decode_unsupported_reqs,
)
end = time.time()
print(f"Iteration time (s): {end - start}")
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501
parser.add_argument(
"--num-iteration",
type=int,
default=100,
help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--num-req", type=int, default=128, help="Number of requests in the batch"
)
parser.add_argument(
"--num-token", type=int, default=1500, help="Number of tokens for each request"
)
parser.add_argument(
"--min-ngram",
type=int,
default=3,
help="Minimum n-gram to match",
)
parser.add_argument(
"--max-ngram",
type=int,
nargs="*",
default=[5, 7, 10, 15, 20],
help="Maximum n-gram to match",
)
parser.add_argument(
"--num-spec-token",
type=int,
default=3,
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
if not args.batched:
benchmark_propose(args)
else:
benchmark_batched_propose(args)
"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
""" # noqa: E501
if __name__ == "__main__":
invoke_main() # pragma: no cover

File diff suppressed because it is too large Load Diff

View File

@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model>
vllm serve <your_model> --disable-log-requests
On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
@ -449,8 +449,7 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
extra_body["structured_outputs"] = {}
extra_body["structured_outputs"][request.structure_type] = request.schema
extra_body[request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")
@ -539,6 +538,20 @@ async def benchmark(
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
extra_body={test_request.structure_type: test_request.schema},
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
pbar.close()
@ -556,10 +569,6 @@ async def benchmark(
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
if max_concurrency is not None:
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
if request_rate != float("inf"):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
@ -657,20 +666,6 @@ async def benchmark(
print("=" * 50)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
extra_body={test_request.structure_type: test_request.schema},
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
return result, ret
@ -697,11 +692,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == "json":
if args.structure_type == "guided_json":
return _eval_correctness_json(expected, actual)
elif args.structure_type == "regex":
elif args.structure_type == "guided_regex":
return _eval_correctness_regex(expected, actual)
elif args.structure_type == "choice":
elif args.structure_type == "guided_choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@ -781,18 +776,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
args.structure_type = "grammar"
args.structure_type = "guided_grammar"
elif args.dataset == "regex":
args.structure_type = "regex"
args.structure_type = "guided_regex"
elif args.dataset == "choice":
args.structure_type = "choice"
args.structure_type = "guided_choice"
else:
args.structure_type = "json"
args.structure_type = "guided_json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
result_file_name = f"{args.structured_output_ratio}so"
result_file_name = f"{args.structured_output_ratio}guided"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
@ -999,7 +994,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentiles. "
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',

View File

@ -1,17 +1,736 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
import warnings
from typing import Any, Optional, Union
import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from benchmark_dataset import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def run_vllm(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
)
)
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
use_beam_search = False
outputs = None
if not use_beam_search:
start = time.perf_counter()
outputs = llm.generate(
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
)
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0].expected_output_len
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
BeamSearchParams(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
),
)
end = time.perf_counter()
return end - start, outputs
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests."
)
prompts = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
)
)
start = time.perf_counter()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start, outputs
async def run_vllm_async(
requests: list[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing
) as llm:
model_config = await llm.get_model_config()
assert all(
model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
lora_requests: list[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
)
)
lora_requests.append(request.lora_request)
generators = []
start = time.perf_counter()
for i, (prompt, sp, lr) in enumerate(
zip(prompts, sampling_params, lora_requests)
):
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: list[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
max_batch_size: int,
trust_remote_code: bool,
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: list[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt = requests[i].prompt
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
if (
max(max_prompt_len, next_prompt_len)
+ max(max_output_len, next_output_len)
) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
if not disable_detokenize:
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: list[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [request.prompt for request in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter()
client = client(model)
client.terminate_server()
return end - start
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
"requests_per_second": [results["requests_per_second"]],
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
},
)
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def get_requests(args, tokenizer):
# Common parameters for all dataset types.
common_kwargs = {
"dataset_path": args.dataset_path,
"random_seed": args.seed,
}
sample_kwargs = {
"tokenizer": tokenizer,
"lora_path": args.lora_path,
"max_loras": args.max_loras,
"num_requests": args.num_prompts,
"input_len": args.input_len,
"output_len": args.output_len,
}
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset
if args.backend == "vllm-chat":
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset."
)
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
common_kwargs["no_stream"] = args.no_stream
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs["dataset_split"] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs["dataset_subset"] = args.hf_subset
common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code
)
requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
elapsed_time = uvloop.run(
run_vllm_async(
requests,
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.disable_detokenize,
)
)
else:
elapsed_time, request_outputs = run_vllm(
requests,
args.n,
EngineArgs.from_cli_args(args),
args.disable_detokenize,
)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(
requests,
args.model,
tokenizer,
args.n,
args.hf_max_batch_size,
args.trust_remote_code,
args.disable_detokenize,
)
elif args.backend == "mii":
elapsed_time = run_mii(
requests, args.model, args.tensor_parallel_size, args.output_len
)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
)
else:
raise ValueError(f"Unknown backend: {args.backend}")
if request_outputs:
# Note: with the vllm and vllm-chat backends,
# we have request_outputs, which we use to count tokens.
total_prompt_tokens = 0
total_output_tokens = 0
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
total_prompt_tokens += (
len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
)
total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print(
"\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details."
)
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
def validate_args(args):
"""
Validate command-line arguments.
"""
# === Deprecation and Defaulting ===
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2,
)
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
args.tokenizer = args.model
# === Backend Validation ===
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
if args.backend not in valid_backends:
raise ValueError(f"Unsupported backend: {args.backend}")
# === Dataset Configuration ===
if not args.dataset and not args.dataset_path:
print("When dataset path is not set, it will default to random dataset")
args.dataset_name = "random"
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
# === Dataset Name Specific Checks ===
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None
):
warnings.warn(
"--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2,
)
elif args.dataset_name == "hf":
if args.dataset_path in (
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm-chat", (
f"{args.dataset_path} needs to use vllm-chat as the backend."
) # noqa: E501
elif args.dataset_path in (
InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm", (
f"{args.dataset_path} needs to use vllm as the backend."
) # noqa: E501
else:
raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != "random" and args.random_range_ratio is not None:
warnings.warn(
"--random-range-ratio will be ignored since \
--dataset-name is not 'random'.",
stacklevel=2,
)
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if (
args.dataset_name not in {"random", "sonnet", None}
and args.prefix_len is not None
):
warnings.warn(
"--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2,
)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError("LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
# === Backend-specific Validations ===
if args.backend == "hf" and args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend")
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if (
args.backend in {"hf", "mii"}
and getattr(args, "quantization", None) is not None
):
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII backend.")
# --data-parallel is not supported currently.
# https://github.com/vllm-project/vllm/issues/16222
if args.data_parallel_size > 1:
raise ValueError(
"Data parallel is not supported in offline benchmark, \
please use benchmark serving instead"
)
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm",
)
parser.add_argument(
"--dataset-name",
type=str,
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
help="Name of the dataset to benchmark on.",
default="sharegpt",
)
parser.add_argument(
"--no-stream",
action="store_true",
help="Do not load the dataset in streaming mode.",
)
parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]",
)
parser.add_argument(
"--dataset-path", type=str, default=None, help="Path to the dataset"
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--async-engine",
action="store_true",
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
),
)
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the LoRA adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.",
)
parser.add_argument(
"--prefix-len",
type=int,
default=None,
help=f"Number of prefix tokens to be used in RandomDataset "
"and SonnetDataset. For RandomDataset, the total input "
"length is the sum of prefix-len (default: "
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
"sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]. For SonnetDataset, "
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
"controls how much of the input is fixed lines versus "
"random lines, but the total input length remains approximately "
"input_len tokens.",
)
# random dataset
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
"for sampling input/output length, "
"used only for RandomDataset. Must be in the range [0, 1) to "
"define a symmetric sampling range "
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
)
# hf dtaset
parser.add_argument(
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
)
parser.add_argument(
"--hf-split", type=str, default=None, help="Split of the HF dataset."
)
parser = AsyncEngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
print("""DEPRECATED: This script has been moved to the vLLM CLI.
Please use the following command instead:
vllm bench throughput
For help with the new command, run:
vllm bench throughput --help
Alternatively, you can run the new command directly with:
python -m vllm.entrypoints.cli.main bench throughput --help
""")
sys.exit(1)
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
validate_args(args)
main(args)

View File

@ -1,12 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any
def convert_to_pytorch_benchmark_format(
@ -73,53 +72,3 @@ def write_to_json(filename: str, records: list) -> None:
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
# Collect time and generate time metrics
#
# Example Usage:
# collector = TimeCollector(TimeCollector.US)
# for _ in range(total_iteration):
# with collector:
# ...
# collector.dump_avg_max()
class TimeCollector:
NS: int = 1
US: int = NS * 1000
MS: int = US * 1000
S: int = MS * 1000
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
def collect(self, v: int) -> None:
self.cnt += 1
self._sum += v
if self._max is None:
self._max = v
else:
self._max = max(self._max, v)
def avg(self) -> Union[float, str]:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> Union[float, str]:
return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[Union[float, str]]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
self.start_time = time.monotonic_ns()
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
) -> None:
self.collect(time.monotonic_ns() - self.start_time)

View File

@ -3,7 +3,7 @@
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all request to prefill vLLM instance. It will buffer KV cache.
# - then send all request to decode instance.
# - then send all request to decode instance.
# - The TTFT of decode instance is the overhead.
set -ex
@ -12,8 +12,6 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
sleep 10
# remove vllm config file
@ -62,8 +60,8 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@ -72,44 +70,44 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
# let the prefill instance finish prefill
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes
}

View File

@ -18,8 +18,6 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}
@ -60,7 +58,7 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
@ -69,7 +67,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@ -78,7 +76,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
@ -99,20 +97,20 @@ benchmark() {
output_len=$2
tag=$3
vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"
sleep 2
}

View File

@ -1,199 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import logging
import os
import aiohttp
from quart import Quart, Response, make_response, request
from rate_limiter import RateLimiter
from request_queue import RequestQueue
from quart import Quart, make_response, request
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def parse_args():
"""parse command line arguments"""
parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server")
# Add args
parser.add_argument(
"--timeout",
type=float,
default=300,
help="Timeout for backend service requests in seconds (default: 300)",
)
parser.add_argument(
"--max-concurrent",
type=int,
default=100,
help="Maximum concurrent requests to backend services (default: 100)",
)
parser.add_argument(
"--queue-size",
type=int,
default=500,
help="Maximum number of requests in the queue (default: 500)",
)
parser.add_argument(
"--rate-limit",
type=int,
default=40,
help="Maximum requests per second (default: 40)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port to run the server on (default: 8000)",
)
parser.add_argument(
"--prefill-url",
type=str,
default="http://localhost:8100/v1/completions",
help="Prefill service endpoint URL",
)
parser.add_argument(
"--decode-url",
type=str,
default="http://localhost:8200/v1/completions",
help="Decode service endpoint URL",
)
return parser.parse_args()
def main():
"""parse command line arguments"""
args = parse_args()
# Initialize configuration using command line parameters
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
MAX_CONCURRENT_REQUESTS = args.max_concurrent
REQUEST_QUEUE_SIZE = args.queue_size
RATE_LIMIT = args.rate_limit
PREFILL_SERVICE_URL = args.prefill_url
DECODE_SERVICE_URL = args.decode_url
PORT = args.port
app = Quart(__name__)
# Initialize the rate limiter and request queue
rate_limiter = RateLimiter(RATE_LIMIT)
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
# Attach the configuration object to the application instance
app.config.update(
{
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
"rate_limiter": rate_limiter,
"request_queue": request_queue,
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
}
)
# Start queue processing on app startup
@app.before_serving
async def startup():
"""Start request processing task when app starts serving"""
asyncio.create_task(request_queue.process())
async def forward_request(url, data):
"""Forward request to backend service with rate limiting and error handling"""
async def forward_request(url, data):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
# if response.headers.get('Transfer-Encoding') == 'chunked':
if True:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
content = await response.read()
yield content
# Use rate limiter as context manager
async with (
rate_limiter,
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
# finish prefill
async for _ in forward_request(
"http://localhost:8100/v1/completions", prefill_request
):
try:
async with session.post(
url=url, json=data, headers=headers
) as response:
if response.status == 200:
# Stream response chunks
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
# Handle backend service errors
error_text = await response.text()
logger.error(
"Backend service error: %s - %s",
response.status,
error_text,
)
yield b'{"error": "Backend service error"}'
except aiohttp.ClientError as e:
# Handle connection errors
logger.error("Connection error to %s: %s", url, str(e))
yield b'{"error": "Service unavailable"}'
except asyncio.TimeoutError:
# Handle timeout errors
logger.error("Timeout connecting to %s", url)
yield b'{"error": "Service timeout"}'
continue
async def process_request():
"""Process a single request through prefill and decode stages"""
try:
original_request_data = await request.get_json()
# return decode
generator = forward_request(
"http://localhost:8200/v1/completions", original_request_data
)
response = await make_response(generator)
response.timeout = None
# Create prefill request (max_tokens=1)
prefill_request = original_request_data.copy()
prefill_request["max_tokens"] = 1
return response
# Execute prefill stage
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
continue
except Exception as e:
import sys
import traceback
# Execute decode stage and stream response
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
response = await make_response(generator)
response.timeout = None # Disable timeout for streaming response
return response
except Exception:
logger.exception("Error processing request")
return Response(
response=b'{"error": "Internal server error"}',
status=500,
content_type="application/json",
)
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
"""Handle incoming API requests with concurrency and rate limiting"""
# Create task for request processing
task = asyncio.create_task(process_request())
# Enqueue request or reject if queue is full
if not await request_queue.enqueue(task):
return Response(
response=b'{"error": "Server busy, try again later"}',
status=503,
content_type="application/json",
)
try:
# Return the response from the processing task
return await task
except asyncio.CancelledError:
# Handle task cancellation (timeout or queue full)
logger.warning("Request cancelled due to timeout or queue full")
return Response(
response=b'{"error": "Request cancelled"}',
status=503,
content_type="application/json",
)
# Start the Quart server with host can be set to 0.0.0.0
app.run(port=PORT)
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
main()
app.run(port=8000)

View File

@ -1,45 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
class RateLimiter:
"""Token bucket rate limiter implementation"""
def __init__(self, rate_limit):
self.rate_limit = rate_limit # Requests per second
self.num_available_tokens = rate_limit # Available tokens
self.last_refill = time.monotonic() # Last token refill time
self.lock = asyncio.Lock() # Synchronization lock
async def acquire(self):
"""Acquire a token from the rate limiter"""
while True:
async with self.lock:
current_time = time.monotonic()
elapsed = current_time - self.last_refill
# Refill num_available_tokens if more than 1 second has passed
if elapsed > 1.0:
self.num_available_tokens = self.rate_limit
self.last_refill = current_time
# Check if num_available_tokens are available
if self.num_available_tokens > 0:
self.num_available_tokens -= 1
return True
# Calculate wait time if no num_available_tokens available
wait_time = 1.0 - elapsed
await asyncio.sleep(wait_time)
async def __aenter__(self):
"""Enter async context manager - acquire token"""
await self.acquire()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
"""Exit async context manager - no cleanup needed"""
pass

View File

@ -1,39 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import deque
class RequestQueue:
"""Request queue manager with concurrency control"""
def __init__(self, max_concurrent, max_queue_size):
# Maximum concurrent requests
self.max_concurrent = max_concurrent
self.max_queue_size = max_queue_size # Maximum queue size
# Concurrency control
self.semaphore = asyncio.Semaphore(max_concurrent)
self.queue = deque() # Request queue
self.queue_size = 0 # Current queue size
self.lock = asyncio.Lock() # Sync queue Lock
async def enqueue(self, task):
"""Add a request task to the queue"""
async with self.lock:
if self.queue_size >= self.max_queue_size:
return False
self.queue.append(task)
self.queue_size += 1
return True
async def process(self):
"""Process queued requests using semaphore for concurrency control"""
while True:
if self.queue:
async with self.semaphore, self.lock:
task = self.queue.popleft()
self.queue_size -= 1
await task
await asyncio.sleep(0.01) # Yield control to event loop

View File

@ -1,145 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
apply_w8a8_block_fp8_linear,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
assert current_platform.is_cuda(), (
"Only support benchmarking w8a8 block fp8 kernel on CUDA device."
)
# DeepSeek-V3 weight shapes
DEEPSEEK_V3_SHAPES = [
(512 + 64, 7168),
(2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
(18432 * 2, 7168),
(24576, 1536),
(12288, 7168),
(4096, 7168),
(7168, 2048),
]
def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
# SM90 CUTLASS requires row-major format for scales
if use_cutlass and current_platform.is_device_capability(90):
Bs = Bs.T.contiguous()
def run():
if use_cutlass:
return apply_w8a8_block_fp8_linear(
A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
)
else:
return apply_w8a8_block_fp8_linear(
A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
)
return run
# Determine available providers
available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
if CUTLASS_BLOCK_FP8_SUPPORTED:
available_providers.append("w8a8-block-fp8-cutlass")
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=available_providers,
line_names=available_providers,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
)
)
def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
M = batch_size
device = "cuda"
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
elif provider == "w8a8-block-fp8-triton":
run_w8a8_triton = build_w8a8_block_fp8_runner(
M, N, K, block_size, device, use_cutlass=False
)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: run_w8a8_triton(), quantiles=quantiles
)
elif provider == "w8a8-block-fp8-cutlass":
run_w8a8_cutlass = build_w8a8_block_fp8_runner(
M, N, K, block_size, device, use_cutlass=True
)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: run_w8a8_cutlass(), quantiles=quantiles
)
else:
raise ValueError(f"Unknown provider: {provider}")
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
if __name__ == "__main__":
block_size = (128, 128)
for N, K in DEEPSEEK_V3_SHAPES:
print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
print(f"TFLOP/s comparison (block_size={block_size}):")
benchmark_tflops.run(
print_data=True,
# show_plots=False,
# save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
N=N,
K=K,
block_size=block_size,
)
print("\nBenchmark finished!")

View File

@ -3,7 +3,6 @@
import argparse
import copy
import itertools
import os
import torch
from weight_shapes import WEIGHT_SHAPES
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
}
_needs_fbgemm = any(
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
)
if _needs_fbgemm:
try:
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
triton_scale_nvfp4_quant,
)
except ImportError:
print(
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
"These providers will be skipped. Please install fbgemm_gpu with: "
"'pip install fbgemm-gpu-genai' to run them."
)
# Disable FBGEMM providers so the benchmark can run.
for cfg in PROVIDER_CFGS.values():
if cfg.get("fbgemm"):
cfg["enabled"] = False
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
if "fbgemm" in cfg and cfg["fbgemm"]:
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
else:
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
if "fbgemm" in cfg and cfg["fbgemm"]:
if cfg["no_a_quant"]:
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
def run():
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
else:
def run():
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
if cfg["no_a_quant"]:
# Pre-quantize activation
@ -184,13 +130,10 @@ if __name__ == "__main__":
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
os.makedirs(save_dir, exist_ok=True)
benchmark.run(
print_data=True,
show_plots=True,
save_path=save_dir,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
)

View File

@ -1,269 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from unittest.mock import patch
import pandas as pd
import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
def with_triton_mode(fn):
"""Temporarily force the Triton fallback path"""
def wrapped(*args, **kwargs):
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
return fn(*args, **kwargs)
return wrapped
# TODO(luka): use standalone_compile utility
def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
def inner(*args):
torch._dynamo.mark_dynamic(args[arg_index], dim_index)
return fn(*args)
return inner
def bench_compile(fn: Callable):
# recompile for different shapes
fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
return with_dyn_arg(fwd, 0, 0)
torch._dynamo.config.recompile_limit = 8888
def calculate_diff(
batch_size: int,
hidden_size: int,
group_shape: GroupShape,
dtype: torch.dtype,
):
"""Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
try:
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_eager_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match")
except AssertionError as e:
print("❌ Implementations differ")
print(e)
configs = []
def benchmark_quantization(
batch_size,
hidden_size,
provider,
group_shape: GroupShape,
col_major: bool,
dtype: torch.dtype,
):
device = torch.device("cuda")
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
fn = lambda: quant_fp8.forward_cuda(x.clone())
elif provider == "triton":
if not group_shape.is_per_group():
# Triton only supported for per-group
return 0, 0, 0
fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
# TODO(luka) extract to utils
def compute_geomean_speedups(
df: pd.DataFrame,
baseline_col: str,
speedup_cols: list[str],
groupby_cols: list[str] | None = None,
) -> pd.DataFrame:
"""
Compute geometric mean speedups over a baseline column.
Args:
df: Input dataframe
baseline_col: Column to use as baseline
speedup_cols: Columns to compute speedups for
groupby_cols: Columns to group by. If None, compute over entire df.
Returns:
pd.DataFrame with geometric mean speedups
"""
from scipy.stats import gmean
def geo_speedup(group: pd.DataFrame) -> pd.Series:
ratios = {
col: (group[baseline_col] / group[col]).values for col in speedup_cols
}
return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
if groupby_cols is None:
result = geo_speedup(df).to_frame().T
else:
result = (
df.groupby(groupby_cols)
.apply(geo_speedup, include_groups=False)
.reset_index()
)
return result
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
)
parser.add_argument("-c", "--check", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
)
parser.add_argument(
"--hidden-sizes",
type=int,
nargs="+",
default=[896, 1024, 2048, 4096, 7168],
help="Hidden sizes to benchmark",
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 16, 128, 512, 1024],
help="Batch sizes to benchmark",
)
parser.add_argument(
"--group-sizes",
type=int,
nargs="+",
default=None,
help="Group sizes for GroupShape(1,N) to benchmark. "
"Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
)
parser.add_argument(
"--no-column-major",
action="store_true",
help="Disable column-major scales testing",
)
args = parser.parse_args()
assert args
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes
batch_sizes = args.batch_sizes
if args.group_sizes is not None:
group_shapes = []
for size in args.group_sizes:
if size == 0:
group_shapes.append(GroupShape.PER_TENSOR)
elif size == -1:
group_shapes.append(GroupShape.PER_TOKEN)
else:
group_shapes.append(GroupShape(1, size))
else:
group_shapes = [
GroupShape.PER_TENSOR,
GroupShape.PER_TOKEN,
GroupShape(1, 64),
GroupShape(1, 128),
]
column_major_scales = [False] if args.no_column_major else [True, False]
config_gen = itertools.product(
group_shapes,
column_major_scales,
batch_sizes,
hidden_sizes,
)
# filter out column-major scales for non-group, reverse order
configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
print(f"Running {len(configs)} configurations:")
print(f" Hidden sizes: {hidden_sizes}")
print(f" Batch sizes: {batch_sizes}")
print(f" Group shapes: {[str(g) for g in group_shapes]}")
print(f" Column major scales: {column_major_scales}")
print()
if args.check:
for group_shape in group_shapes:
group_size = group_shape[1]
print(f"{group_size=}")
calculate_diff(
batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
)
benchmark = triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda", "triton"],
line_names=["Torch (Compiled)", "CUDA", "Triton"],
styles=[("blue", "-"), ("green", "-"), ("black", "-")],
ylabel="us",
plot_name="QuantFP8 performance",
args={},
)
)(benchmark_quantization)
df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
# Print geomean speedups
geo_table_grouped = compute_geomean_speedups(
df,
baseline_col="Torch (Compiled)",
speedup_cols=["CUDA", "Triton"],
groupby_cols=["col_major", "group_shape"],
)
print("Speedup over Torch (Compiled)")
print(geo_table_grouped.to_string(index=False))

View File

@ -1,104 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# benchmark custom activation op performance
import itertools
import torch
import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
def benchmark_activation(
batch_size: int,
seq_len: int,
intermediate_size: int,
provider: str,
func_name: str,
dtype: torch.dtype,
):
device = "cuda"
num_tokens = batch_size * seq_len
dim = intermediate_size
current_platform.seed_everything(42)
torch.set_default_device(device)
if func_name == "gelu_and_mul":
layer = CustomOp.op_registry[func_name](approximate="none")
elif func_name == "gelu_and_mul_tanh":
layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
elif func_name == "fatrelu_and_mul":
threshold = 0.5
layer = CustomOp.op_registry[func_name](threshold)
else:
layer = CustomOp.op_registry[func_name]()
x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
compiled_layer = torch.compile(layer.forward_native)
if provider == "custom":
fn = lambda: layer(x)
elif provider == "compiled":
fn = lambda: compiled_layer(x)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
fn, quantiles=[0.5, 0.2, 0.8]
)
return ms, max_ms, min_ms
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
parser.add_argument(
"--func-name",
type=str,
choices=[
"mul_and_silu",
"silu_and_mul",
"gelu_and_mul",
"gelu_and_mul_tanh",
"fatrelu_and_mul",
"swigluoai_and_mul",
"gelu_new",
"gelu_fast",
"quick_gelu",
],
default="silu_and_mul",
)
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
)
args = parser.parse_args()
assert args
func_name = args.func_name
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
perf_report = triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size", "seq_len", "intermediate_size"],
x_vals=configs,
line_arg="provider",
line_vals=["custom", "compiled"],
line_names=["Custom OP", "Compiled"],
styles=[("blue", "-"), ("green", "-")],
ylabel="ms",
plot_name=f"{func_name}-op-performance",
args={},
)
)
perf_report(
lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
batch_size, seq_len, intermediate_size, provider, func_name, dtype
)
).run(print_data=True)

View File

@ -0,0 +1,345 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys
from typing import Optional
import torch
import torch.nn.functional as F
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight,
generic_dequantize_gemm,
get_int_dtype,
optimized_dequantize_gemm,
)
from vllm.utils import FlexibleArgumentParser
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def torch_mult(
# [..., in_features]
input: torch.Tensor,
weights: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
) -> torch.Tensor:
output = F.linear(input, weights)
return output
def dequant_out_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
if bias is None:
output = F.linear(input, weights, bias)
orig_shape = output.shape
flattened_output = output.view(-1, output.size(-1))
f_scales = scales.view(-1, scales.shape[0])
b_scales = f_scales.expand(flattened_output.shape[0], -1)
flattened_output *= b_scales
return flattened_output.view(orig_shape)
else:
b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_weight_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_no_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
return F.linear(input, weights, bias)
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
n = int(parts.sum().item())
device = torch.device("cuda:0")
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(
-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device,
)
codebooks = torch.randn(
size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device,
)
count = 0
for index in range(16):
for i in range(8):
for book in range(nbooks):
codebooks[book, index, 0, i] = count * (10**book)
count += 1
print("codes shape", codes.shape)
for i in range(16):
for book in range(nbooks):
codes[0, i, book] = i
codes[0, -i, book] = i
weights = dequantize_weight(codes, codebooks, None)
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
print("weights shape:", weights.shape)
print("weights2 shape:", weights2.shape)
print("weights are:", weights)
print("weights2 are:", weights2)
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
print("last 128 weights are", weights[0, -128:])
print("last 128 weights2 are:", weights2[0, -128:])
def main():
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument(
"--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
)
parser.add_argument(
"--bits",
type=int,
default=16,
help="Number of bits per code element (default: 16)",
)
parser.add_argument(
"--test",
type=bool,
default=False,
help="Run the decompression/dequant tester rather than benchmarking "
"(default: False)",
)
# Parse the arguments
args = parser.parse_args()
# Extract values
nbooks = args.nbooks
bits = args.bits
if args.test:
dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
return
# Otherwise, benchmark.
methods = [
ops.aqlm_gemm,
dequant_out_scale,
generic_dequantize_gemm,
optimized_dequantize_gemm,
dequant_weight_scale,
torch_mult,
dequant_no_scale,
]
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
print(f"writing benchmarks to file {filename}")
with open(filename, "w") as f:
sys.stdout = f
print("m | k | n | n parts", end="")
for method in methods:
print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
print("")
# These are reasonable prefill sizes.
ksandpartions = (
(4096, (4096, 4096, 4096)),
(4096, (4096,)),
(4096, (11008, 11008)),
(11008, (4096,)),
)
# reasonable ranges for m.
for m in [
1,
2,
4,
8,
10,
12,
14,
16,
24,
32,
48,
52,
56,
64,
96,
112,
128,
256,
512,
1024,
1536,
2048,
3072,
4096,
]:
print(f"{m}", file=sys.__stdout__)
for ksp in ksandpartions:
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
# I didn't see visible improvements from increasing these, but feel free :)
num_warmup_trials = 1
num_trials = 1
num_calls = 100
# warmup.
for method in methods:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
n = parts.sum().item()
print(f"{m} | {k} | {n} | {parts.tolist()}", end="")
for method in methods:
best_time_us = 1e20
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
kernel_dur_us = 1000 * kernel_dur_ms
if kernel_dur_us < best_time_us:
best_time_us = kernel_dur_us
print(f" | {kernel_dur_us:.0f}", end="")
print("")
def run_timing(
num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
) -> float:
n = int(parts.sum().item())
device = torch.device("cuda:0")
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(
-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device,
)
codebooks = torch.randn(
size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device,
)
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
# for comparison to just a pytorch mult.
weights = torch.randn((n, k), dtype=torch.float16, device=device)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
if method is torch_mult:
for i in range(num_calls):
torch_mult(input, weights, scales)
else:
for i in range(num_calls):
method(input, codes, codebooks, scales, parts, None)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())

View File

@ -3,8 +3,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from packaging import version
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION,
)
@ -12,7 +10,7 @@ from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
try:
import bitblas
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
raise ImportError(
"bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"

View File

@ -13,10 +13,6 @@ import torch.utils.benchmark as benchmark
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@ -144,12 +140,6 @@ def bench_run(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -157,7 +147,10 @@ def bench_run(
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
def run_cutlass_moe_fp4(
@ -179,27 +172,25 @@ def bench_run(
device: torch.device,
num_repeats: int,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
quant_config=quant_config,
device=device,
)
def run_cutlass_from_graph(
@ -220,29 +211,26 @@ def bench_run(
e: int,
device: torch.device,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_alphas,
a2_gscale=a2_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
quant_config=quant_config,
device=device,
)
def run_triton_from_graph(
@ -258,18 +246,16 @@ def bench_run(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
def replay_graph(graph, num_repeats):

View File

@ -1,406 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""
import nvtx
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
"mixtral-8x7b": [
[8, 2, 4096, 14336],
],
"deepseek-v2": [
[160, 6, 5120, 12288],
],
"custom-small": [
[8, 2, 2048, 7168],
],
"glm45-fp8": [
[128, 8, 4096, 1408],
],
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
[128, 1, 5120, 8192],
],
}
DEFAULT_MODELS = [
"mixtral-8x7b",
]
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]
FP8_DTYPE = current_platform.fp8_dtype()
def bench_run(
results: list,
model: str,
num_experts: int,
topk: int,
per_act_token: bool,
per_out_ch: bool,
mkn: tuple[int, int, int],
):
(m, k, n) = mkn
dtype = torch.half
device = "cuda"
# Create input activations
a = torch.randn((m, k), device=device, dtype=dtype) / 10
# Create weights
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
# Create FP8 quantized weights and scales for both kernels
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
# Create scales based on quantization strategy
if per_out_ch:
# Per-channel quantization
w1_scale = torch.empty(
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
)
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
else:
# Per-tensor quantization
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
# Quantize weights
for expert in range(num_experts):
if per_out_ch:
# Per-channel quantization - not yet implemented properly
# For now, fall back to per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Expand scalar scales to the expected per-channel shape
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
w2_scale[expert] = w2_scale_temp.expand(k, 1)
else:
# Per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Store scalar scales in [1, 1] tensors
w1_scale[expert, 0, 0] = w1_scale_temp
w2_scale[expert, 0, 0] = w2_scale_temp
# Prepare weights for CUTLASS (no transpose needed)
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
# Create router scores and get topk
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
# Force per-tensor quantization for all cases to match working e2e setup
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
# Force per-tensor quantization for all cases
per_act_token = False
# Create stride tensors for CUTLASS
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
def run_cutlass_moe_fp8(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
cutlass_moe_fp8(
a=a,
w1_q=w1,
w2_q=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
cutlass_moe_fp8(
a=a,
w1_q=w1_fp8q_cutlass,
w2_q=w2_fp8q_cutlass,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
fused_experts(
a,
w1_fp8q,
w2_fp8q,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
# Timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
# Divide by 10 since graph contains 10 calls
return sum(latencies) / (num_iters * 10)
# Benchmark parameters
num_warmup = 5
num_iters = 100
# Benchmark only CUDA graphs (more reliable and faster)
# Benchmark Triton MoE with CUDA graphs
triton_graph_time = bench_cuda_graph(
triton_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Benchmark CUTLASS MoE with CUDA graphs
cutlass_graph_time = bench_cuda_graph(
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Convert ms to us and return results
triton_time_us = triton_graph_time * 1000
cutlass_time_us = cutlass_graph_time * 1000
return {
"batch_size": m,
"triton_time_us": triton_time_us,
"cutlass_time_us": cutlass_time_us,
}
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
all_results = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in args.per_act_token_opts:
for per_out_ch in args.per_out_ch_opts:
print(
f"\n=== {model}, experts={num_experts}, topk={topk},"
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
)
config_results = []
for size_m in args.batch_sizes:
mkn = (size_m, size_k, size_n)
result = bench_run(
[], # Not used anymore
model,
num_experts,
topk,
per_act_token,
per_out_ch,
mkn,
)
if result:
config_results.append(result)
# Print results table for this configuration
if config_results:
print(
f"\n{'Batch Size':<12}"
f"{'Triton (us)':<15}"
f"{'CUTLASS (us)':<15}"
)
print("-" * 45)
for result in config_results:
print(
f"{result['batch_size']:<12}"
f"{result['triton_time_us']:<15.2f}"
f"{result['cutlass_time_us']:<15.2f}"
)
all_results.extend(config_results)
print(f"\nTotal benchmarks completed: {len(all_results)}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
across specified models/shapes/batches
Example usage:
python benchmark_cutlass_moe_fp8.py \
--model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
--tp-sizes 8 \
--batch-size 2 4 8 \
--per-act-token-opts false \
--per-out-ch-opts false
"""
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
parser.add_argument(
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument(
"--per-act-token-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-activation token quantization options (true/false)",
)
parser.add_argument(
"--per-out-ch-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-output channel quantization options (true/false)",
)
args = parser.parse_args()
main(args)

View File

@ -1,508 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).
for NCCL symmetric memory you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
not use fast NVLS implementation for all reduce.
Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
Example:
torchrun --nproc_per_node=2 benchmark_device_communicators.py
--sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
"""
import json
import os
import time
from contextlib import nullcontext
from typing import Callable, Optional
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
logger = init_logger(__name__)
# Default sequence lengths to benchmark
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
# Fixed hidden size and dtype for all benchmarks
HIDDEN_SIZE = 8192
BENCHMARK_DTYPE = torch.bfloat16
# CUDA graph settings
CUDA_GRAPH_CAPTURE_CYCLES = 10
class CommunicatorBenchmark:
"""Benchmark class for testing device communicators."""
def __init__(
self,
rank: int,
world_size: int,
device: torch.device,
cpu_group: ProcessGroup,
sequence_lengths: list[int],
):
self.rank = rank
self.world_size = world_size
self.device = device
self.cpu_group = cpu_group
# Calculate max_size_override based on largest sequence length
max_seq_len = max(sequence_lengths)
max_tensor_elements = max_seq_len * HIDDEN_SIZE
self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
# Initialize communicators
self.custom_allreduce = None
self.pynccl_comm = None
self.symm_mem_comm = None
self.symm_mem_comm_multimem = None
self.symm_mem_comm_two_shot = None
self._init_communicators()
def _init_communicators(self):
"""Initialize all available communicators."""
try:
self.custom_allreduce = CustomAllreduce(
group=self.cpu_group,
device=self.device,
max_size=self.max_size_override,
)
if not self.custom_allreduce.disabled:
logger.info("Rank %s: CustomAllreduce initialized", self.rank)
else:
logger.info("Rank %s: CustomAllreduce disabled", self.rank)
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
)
self.custom_allreduce = None
try:
self.pynccl_comm = PyNcclCommunicator(
group=self.cpu_group, device=self.device
)
if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
)
self.pynccl_comm = None
# Initialize variants for SymmMemCommunicator
try:
self.symm_mem_comm_multimem = SymmMemCommunicator(
group=self.cpu_group,
device=self.device,
force_multimem=True,
max_size_override=self.max_size_override,
)
if not self.symm_mem_comm_multimem.disabled:
logger.info(
"Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
)
else:
self.symm_mem_comm_multimem = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
self.rank,
e,
)
self.symm_mem_comm_multimem = None
try:
self.symm_mem_comm_two_shot = SymmMemCommunicator(
group=self.cpu_group,
device=self.device,
force_multimem=False,
max_size_override=self.max_size_override,
)
if not self.symm_mem_comm_two_shot.disabled:
logger.info(
"Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
)
else:
self.symm_mem_comm_two_shot = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
self.rank,
e,
)
self.symm_mem_comm_two_shot = None
def benchmark_allreduce(
self, sequence_length: int, num_warmup: int, num_trials: int
) -> dict[str, float]:
"""Benchmark allreduce operations for all available communicators."""
results = {}
# Define communicators with their benchmark functions
communicators = []
if self.custom_allreduce is not None:
comm = self.custom_allreduce
# CustomAllreduce one-shot
communicators.append(
(
"ca_1stage",
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
"1stage", # env variable value
)
)
# CustomAllreduce two-shot
communicators.append(
(
"ca_2stage",
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
"2stage", # env variable value
)
)
if self.pynccl_comm is not None:
comm = self.pynccl_comm
communicators.append(
(
"pynccl",
lambda t, c=comm: c.all_reduce(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem
communicators.append(
(
"symm_mem_multimem",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_two_shot is not None:
comm = self.symm_mem_comm_two_shot
communicators.append(
(
"symm_mem_two_shot",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
None, # no env variable needed
)
)
# Benchmark each communicator
for name, allreduce_fn, should_use_fn, context, env_var in communicators:
# Set environment variable if needed
if env_var is not None:
os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
else:
# Clear the environment variable to avoid interference
os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
latency = self.benchmark_allreduce_single(
sequence_length,
allreduce_fn,
should_use_fn,
context,
num_warmup,
num_trials,
)
if latency is not None:
results[name] = latency
return results
def benchmark_allreduce_single(
self,
sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
should_use_fn: Callable[[torch.Tensor], bool],
context,
num_warmup: int,
num_trials: int,
) -> Optional[float]:
"""Benchmark method with CUDA graph optimization."""
try:
# Create test tensor (2D: sequence_length x hidden_size)
tensor = torch.randn(
sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
)
if not should_use_fn(tensor):
return None
torch.cuda.synchronize()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
graph_input = tensor.clone()
# Warmup before capture
for _ in range(3):
allreduce_fn(graph_input)
# Capture the graph using context manager
with context:
graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
torch.cuda.synchronize()
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
torch.cuda.synchronize()
start_time = time.perf_counter()
for _ in range(num_trials):
graph.replay()
torch.cuda.synchronize()
end_time = time.perf_counter()
# Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
return (
(end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
)
except Exception as e:
logger.error("CUDA graph benchmark failed: %s", e)
raise RuntimeError(
f"CUDA graph benchmark failed for communicator: {e}"
) from e
def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
"""Calculate speedup information for a single tensor size."""
if not comm_results:
return "N/A"
# Find the fastest communicator
fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
fastest_time = comm_results[fastest_comm]
# Calculate speedup vs PyNccl if available
if "pynccl" in comm_results:
pynccl_time = comm_results["pynccl"]
speedup = pynccl_time / fastest_time
return f"{fastest_comm} ({speedup:.2f}x)"
else:
return f"{fastest_comm} (N/A)"
def print_results(
results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
):
"""Print benchmark results in a formatted table."""
print(f"\n{'=' * 130}")
print("Device Communicator Benchmark Results")
print(
f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
f"Hidden Size: {HIDDEN_SIZE}"
)
print(f"{'=' * 130}")
# Get all communicator names
all_comms = set()
for size_results in results.values():
all_comms.update(size_results.keys())
all_comms = sorted(list(all_comms))
# Print header
header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
for comm in all_comms:
header += f"{comm:<20}"
header += f"{'Best (Speedup vs PyNccl)':<30}"
print(header)
print("-" * len(header))
# Print results for each sequence length
for seq_len in sequence_lengths:
if seq_len in results:
# Calculate tensor size in elements and bytes
tensor_elements = seq_len * HIDDEN_SIZE
tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
# Format tensor size (MB)
tensor_size_mb = tensor_bytes / (1024 * 1024)
tensor_size_str = f"{tensor_size_mb:.2f} MB"
# Format tensor shape
tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
row = f"{tensor_shape:<20}{tensor_size_str:<15}"
for comm in all_comms:
if comm in results[seq_len]:
row += f"{results[seq_len][comm]:<20.3f}"
else:
row += f"{'N/A':<20}"
# Calculate speedup information
speedup_info = _calculate_speedup_info(results[seq_len])
row += f"{speedup_info:<30}"
print(row)
print(f"{'=' * 130}")
print("All times are in milliseconds (ms) per allreduce operation")
print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
def main():
parser = FlexibleArgumentParser(description="Benchmark device communicators")
parser.add_argument(
"--sequence-lengths",
type=int,
nargs="+",
default=DEFAULT_SEQUENCE_LENGTHS,
help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
)
parser.add_argument(
"--num-warmup", type=int, default=5, help="Number of warmup iterations"
)
parser.add_argument(
"--num-trials", type=int, default=50, help="Number of benchmark trials"
)
parser.add_argument("--output-json", type=str, help="Output results to JSON file")
args = parser.parse_args()
# Initialize distributed
if not dist.is_initialized():
dist.init_process_group(backend="gloo")
rank = dist.get_rank()
world_size = dist.get_world_size()
# Set device
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
# Get CPU process group
cpu_group = dist.new_group(backend="gloo")
# Disable USE_SYMM_MEM to avoid affecting the max_sizes
# in symm_mem and custom_all_reduce for benchmark
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
# Initialize benchmark
benchmark = CommunicatorBenchmark(
rank, world_size, device, cpu_group, args.sequence_lengths
)
# Run benchmarks
all_results = {}
for seq_len in args.sequence_lengths:
if rank == 0:
logger.info(
"Benchmarking sequence length: %s (tensor shape: %s x %s)",
seq_len,
seq_len,
HIDDEN_SIZE,
)
results = benchmark.benchmark_allreduce(
sequence_length=seq_len,
num_warmup=args.num_warmup,
num_trials=args.num_trials,
)
all_results[seq_len] = results
# Synchronize between ranks
dist.barrier()
# Print results (only rank 0)
if rank == 0:
print_results(all_results, args.sequence_lengths, world_size)
# Save to JSON if requested
if args.output_json:
# Add speedup information to results
enhanced_results = {}
for seq_len, comm_results in all_results.items():
enhanced_results[seq_len] = {
"timings": comm_results,
"speedup_info": _calculate_speedup_info(comm_results),
}
output_data = {
"world_size": world_size,
"dtype": str(BENCHMARK_DTYPE),
"hidden_size": HIDDEN_SIZE,
"sequence_lengths": args.sequence_lengths,
"num_warmup": args.num_warmup,
"num_trials": args.num_trials,
"cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
"results": enhanced_results,
}
with open(args.output_json, "w") as f:
json.dump(output_data, f, indent=2)
logger.info("Results saved to %s", args.output_json)
# Cleanup
if cpu_group != dist.group.WORLD:
dist.destroy_process_group(cpu_group)
if __name__ == "__main__":
main()

View File

@ -7,7 +7,6 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@ -81,11 +80,6 @@ def bench_run(
a, score, topk, renormalize=False
)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
@ -97,11 +91,6 @@ def bench_run(
a_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -109,7 +98,10 @@ def bench_run(
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
def run_cutlass_moe(
@ -119,21 +111,11 @@ def bench_run(
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
per_act_token: bool,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@ -141,11 +123,10 @@ def bench_run(
w2,
topk_weights,
topk_ids,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
quant_config=quant_config,
w1_scale,
w2_scale,
per_act_token,
a1_scale=None,
)
def run_cutlass_from_graph(
@ -155,19 +136,9 @@ def bench_run(
w2_q: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -177,11 +148,10 @@ def bench_run(
w2_q,
topk_weights,
topk_ids,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
quant_config=quant_config,
w1_scale,
w2_scale,
per_act_token,
a1_scale=None,
)
def run_triton_from_graph(
@ -194,11 +164,6 @@ def bench_run(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -208,7 +173,10 @@ def bench_run(
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
def replay_graph(graph, num_repeats):
@ -226,10 +194,6 @@ def bench_run(
w2_q,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights,
topk_ids,
)
@ -267,10 +231,6 @@ def bench_run(
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"per_act_token": per_act_token,
"ab_strides1": ab_strides1,
"ab_strides2": ab_strides2,
"c_strides1": c_strides1,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@ -329,10 +289,6 @@ def bench_run(
w2_q,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights,
topk_ids,
per_act_token,
@ -341,7 +297,7 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,

Some files were not shown because too many files have changed in this diff Show More