Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-21 07:13:52 +08:00)

Compare commits: main ... debug-logg (7 commits)

Commits:
- f0945e311d
- 4ec76caafa
- 1588294a88
- e82e9afeb7
- 10abfaf309
- 9ff1a2b537
- 0abe10e4a7
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 
 
 def print_top_10_largest_files(zip_file):
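Either side of this hunk reads the limit from the same environment variable, so it can be raised for a one-off build without editing the script. A minimal sketch (the script name and wheel path are assumptions for illustration, not taken from this diff):

```bash
# Hypothetical invocation of the wheel-size check with a temporary override
VLLM_MAX_SIZE_MB=600 python check_wheel_size.py dist/vllm-*.whl
```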
@@ -8,8 +8,7 @@ template = """<!DOCTYPE html>
 <html>
 <body>
 <h1>Links for vLLM</h1/>
-<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+<a href="../{wheel_html_escaped}">{wheel}</a><br/>
 </body>
 </html>
 """
@@ -22,25 +21,7 @@ filename = os.path.basename(args.wheel)
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
     )
@@ -1,12 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
-tasks:
-- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
-    value: 0.80
-limit: 100
-num_fewshot: 0
@@ -1,10 +0,0 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
-model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-tasks:
-- name: "mmlu_pro"
-  metrics:
-  - name: "exact_match,custom-extract"
-    value: 0.80
-limit: 250 # will run on 250 * 14 subjects = 3500 samples
-num_fewshot: 5
@@ -1,5 +1,4 @@
-# For vllm script, with -t option (tensor parallel size)
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
-
-model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
-backend: "vllm-vlm"
-tasks:
-- name: "chartqa"
-  metrics:
-  - name: "relaxed_accuracy,none"
-    value: 0.855
-limit: 2500
-num_fewshot: 0
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -1 +0,0 @@
-Qwen2.5-VL-7B-Instruct.yaml
@@ -1,44 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on chartqa for vllm.
-#
-# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.9
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on ChartQA using multimodal vllm."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our correctness tests in vllm's CI."
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo " -m - huggingface stub or local directory of the model"
-    echo " -l - limit number of samples to run"
-    echo " -t - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:l:t:" OPT; do
-  case ${OPT} in
-    m )
-      MODEL="$OPTARG"
-      ;;
-    l )
-      LIMIT="$OPTARG"
-      ;;
-    t )
-      TP_SIZE="$OPTARG"
-      ;;
-    \? )
-      usage
-      exit 1
-      ;;
-  esac
-done
-
-lm_eval --model vllm-vlm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
-  --tasks chartqa \
-  --batch_size auto \
-  --apply_chat_template \
-  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (2 changes; Executable file → Normal file)
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``
@@ -1,50 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo " -m - huggingface stub or local directory of the model"
-    echo " -l - limit number of samples to run"
-    echo " -f - number of fewshot samples to use"
-    echo " -t - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
-  case ${OPT} in
-    m )
-      MODEL="$OPTARG"
-      ;;
-    b )
-      BATCH_SIZE="$OPTARG"
-      ;;
-    l )
-      LIMIT="$OPTARG"
-      ;;
-    f )
-      FEWSHOT="$OPTARG"
-      ;;
-    t )
-      TP_SIZE="$OPTARG"
-      ;;
-    \? )
-      usage
-      exit 1
-      ;;
-  esac
-done
-
-lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
-  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
-  --batch_size auto
@@ -19,27 +19,21 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
-    batch_size = eval_config.get("batch_size", "auto")
-    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
-        model=backend,
+        model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
-        batch_size=batch_size,
+        batch_size="auto",
     )
     return results
 
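For orientation, the `launch_lm_eval` helper above drives `lm_eval` the same way the shell baselines elsewhere in this diff do; a rough CLI equivalent of the multimodal (`vllm-vlm`) path, with placeholder model and limit values taken from the ChartQA config above, would be:

```bash
# Sketch based on the baseline scripts in this diff; model and limit are placeholders
lm_eval --model vllm-vlm \
  --model_args "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit 100
```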
@@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 
 ## Performance benchmark quick overview
 
@@ -28,7 +28,6 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
 ## Trigger the benchmark
 
 Performance benchmark will be triggered when:
-
 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
@@ -39,7 +38,6 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 Runtime environment variables:
-
 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
@@ -48,14 +46,12 @@ Runtime environment variables:
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 
 Nightly benchmark will be triggered when:
-
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
->
 ### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
@@ -78,7 +74,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
@@ -86,13 +82,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Throughput test
 
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 
 ### Serving test
 
-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 
 ```json
 [
@@ -104,6 +100,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
       "tensor_parallel_size": 1,
       "swap_space": 16,
       "disable_log_stats": "",
+      "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
@@ -121,8 +118,8 @@ Inside this example:
 
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `vllm bench serve`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
 
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
 
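To make the `qps_list` mapping concrete: a single entry such as `16` ends up on the client command line roughly as below. This is a sketch only; the model name and dataset flag are illustrative, and `--request-rate` is the flag named in this README:

```bash
# Sketch of one client run generated from a qps_list entry of 16
vllm bench serve \
  --model meta-llama/Meta-Llama-3-8B \
  --dataset-name sharegpt \
  --request-rate 16
```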
@@ -138,20 +135,27 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
 
 The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
 When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
 
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b without detail test name.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
 
+| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|----------------------------------------|----------------------------------------|----------|
+| 0 | 142.633982 | 156.526018 | 1.097396 |
+| 1 | 241.620334 | 294.018783 | 1.216863 |
+| 2 | 218.298905 | 262.664916 | 1.203235 |
+| 3 | 242.743860 | 299.816190 | 1.235113 |
 
+Here is an example using the script to compare result_a and result_b with detail test name.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
+| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
+|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
+| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
+| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
+| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
+| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
+| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
 
 ## Nightly test details
 
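As a side note on the single-file mode described in the `-` line of this hunk, the same script can be pointed at one results file to compare TP/PP configurations; a minimal sketch (the result path is hypothetical):

```bash
# Compare TP/PP configurations recorded in a single benchmark_results.json
python3 compare-json-results.py -f results/benchmark_results.json
```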
@@ -160,9 +164,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
 ### Workflow
 
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 
 ### Nightly tests
 
@@ -172,6 +176,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
 
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@@ -1,4 +1,3 @@
-# Nightly benchmark annotation
 
 ## Description
 
@@ -14,15 +13,15 @@ Please download the visualization scripts in the post
 
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
   - In the same folder, run the following code:
 
   ```bash
  export HF_TOKEN=<your HF token>
  apt update
  apt install -y git
  unzip nightly-benchmarks.zip
  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  ```
 
 And the results will be inside `./benchmarks/results`.
@@ -8,30 +8,30 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
 - Docker images:
   - vLLM: `vllm/vllm-openai:v0.6.2`
   - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
   - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
   - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
   - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
   - 8x Nvidia A100 GPUs
 - Workload:
   - Dataset
     - ShareGPT dataset
     - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
     - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
   - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
   - Models: llama-3 8B, llama-3 70B.
     - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
   - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
   - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
   - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 
 ## Known issues
 
@@ -1,4 +1,3 @@
-# Performance benchmarks descriptions
 
 ## Latency tests
 
@@ -1,202 +1,33 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
-import json
-import os
-from importlib import util
 
 import pandas as pd
 
-plotly_found = util.find_spec("plotly.express") is not None
-
 
 def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
+    files, name_column, data_column, drop_column, ignore_test_name=False
 ):
-    """
-    Align concatenation by keys derived from info_cols instead of row order.
-    - Pick one canonical key list: subset of info_cols present in ALL files.
-    - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
-    - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
-    - If --debug, add a <file_label>_name column per file.
-    """
-    print("\ncompare_data_column:", data_column)
-
+    print("\ncompare_data_column: " + data_column)
     frames = []
-    raw_data_cols = []
     compare_frames = []
-
-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
-    for f in files:
-        try:
-            df_tmp = pd.read_json(f, orient="records")
-        except Exception as err:
-            raise ValueError(f"Failed to read {f}") from err
-        cols_per_file.append(set(df_tmp.columns))
-
-    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
-    if not key_cols:
-        # soft fallback: use any info_cols present in the first file
-        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
-    if not key_cols:
-        raise ValueError(
-            "No common key columns found from info_cols across the input files."
-        )
-
-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
-    meta_added = False
-
     for file in files:
-        df = pd.read_json(file, orient="records")
-
-        # Keep rows that actually have the compared metric (same as original behavior)
-        if drop_column in df.columns:
-            df = df.dropna(subset=[drop_column], ignore_index=True)
-
-        # Stabilize numeric key columns (harmless if missing)
-        for c in (
-            "Input Len",
-            "Output Len",
-            "TP Size",
-            "PP Size",
-            "# of max concurrency.",
-            "qps",
-        ):
-            if c in df.columns:
-                df[c] = pd.to_numeric(df[c], errors="coerce")
-
-        # Ensure all key columns exist
-        for c in key_cols:
-            if c not in df.columns:
-                df[c] = pd.NA
-
-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
-        df_idx = df.set_index(key_cols, drop=False)
-
-        # meta (key columns), unique per key
-        meta = df_idx[key_cols]
-        if not meta.index.is_unique:
-            meta = meta.groupby(level=key_cols, dropna=False).first()
-
-        # metric series for this file, aggregated to one row per key
-        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
-
-        # add meta once (from first file) so keys are the leftmost columns
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
-        # (NEW) debug: aligned test-name column per file
-        if debug and name_column in df_idx.columns:
-            name_s = df_idx[name_column]
-            if not name_s.index.is_unique:
-                name_s = name_s.groupby(level=key_cols, dropna=False).first()
-            name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-
-        frames.append(s)
-        raw_data_cols.append(file_label)
-        compare_frames.append(s)
-
-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
+        data_df = pd.read_json(file)
+        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
+        if ignore_test_name is False:
+            serving_df = serving_df.rename(columns={name_column: file + "_name"})
+            frames.append(serving_df[file + "_name"])
+        serving_df = serving_df.rename(columns={data_column: file})
+        frames.append(serving_df[file])
+        compare_frames.append(serving_df[file])
         if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
-            frames.append(ratio)
+            # Compare numbers among two files
+            ratio_df = compare_frames[1] / compare_frames[0]
+            frames.append(ratio_df)
+            compare_frames.pop(1)
 
-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
     concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
-
-    # Ensure key/info columns appear first (in your info_cols order)
-    front = [c for c in info_cols if c in concat_df.columns]
-    rest = [c for c in concat_df.columns if c not in front]
-    concat_df = concat_df[front + rest]
-
-    print(raw_data_cols)
-    return concat_df, raw_data_cols
-
-
-def split_json_by_tp_pp(
-    input_file: str = "benchmark_results.json", output_root: str = "."
-) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
-    with open(input_file, encoding="utf-8") as f:
-        data = json.load(f)
-
-    # If the JSON is a dict with a list under common keys, use that list
-    if isinstance(data, dict):
-        for key in ("results", "serving_results", "benchmarks", "data"):
-            if isinstance(data.get(key), list):
-                data = data[key]
-                break
-
-    df = pd.DataFrame(data)
-
-    # Keep only "serving" tests
-    name_col = next(
-        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
-    )
-    if name_col:
-        df = df[
-            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
-        ].copy()
-
-    # Handle alias column names
-    rename_map = {
-        "tp_size": "TP Size",
-        "tensor_parallel_size": "TP Size",
-        "pp_size": "PP Size",
-        "pipeline_parallel_size": "PP Size",
-    }
-    df.rename(
-        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
-    )
-
-    # Ensure TP/PP columns exist (default to 1 if missing)
-    if "TP Size" not in df.columns:
-        df["TP Size"] = 1
-    if "PP Size" not in df.columns:
-        df["PP Size"] = 1
-
-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-
-    # Split into separate folders
-    saved_paths: list[str] = []
-    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
-        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
-        os.makedirs(folder_name, exist_ok=True)
-        filepath = os.path.join(folder_name, "benchmark_results.json")
-        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
-        print(f"Saved: {filepath}")
-        saved_paths.append(filepath)
-
-    return saved_paths
+    return concat_df
 
 
 if __name__ == "__main__":
@@ -205,103 +36,31 @@
         "-f", "--file", action="append", type=str, help="input file name"
     )
     parser.add_argument(
-        "--debug", action="store_true", help="show all information for debugging"
-    )
-    parser.add_argument(
-        "--plot",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="plot perf diagrams or not --no-plot --plot",
-    )
-    parser.add_argument(
-        "-x",
-        "--xaxis",
-        type=str,
-        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
+        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
     )
     args = parser.parse_args()
+    files = args.file
+    print("comparing : " + ", ".join(files))
 
     drop_column = "P99"
     name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]
     data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
     html_msgs_for_data_cols = [
         "Compare Output Tokens /n",
         "Median TTFT /n",
         "Median TPOT /n",
     ]
-    if len(args.file) == 1:
-        files = split_json_by_tp_pp(args.file[0], output_root="splits")
-        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
-    else:
-        files = args.file
-    print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
+    ignore_test_name = args.ignore_test_name
     with open("perf_comparison.html", "w") as text_file:
         for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
+            output_df = compare_data_columns(
                 files,
                 name_column,
                 data_cols_to_compare[i],
-                info_cols,
                 drop_column,
-                debug=debug,
+                ignore_test_name=ignore_test_name,
             )
-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                html = group.to_html()
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-
-                if plot and plotly_found:
-                    import plotly.express as px
-
-                    df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
-                    )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    # Create Plotly line chart
-                    fig = px.line(
-                        df_melted,
-                        x=info_cols[y_axis_index],
-                        y=data_cols_to_compare[i],
-                        color="Configuration",
-                        title=title,
-                        markers=True,
-                    )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+            print(output_df)
+            html = output_df.to_html()
+            text_file.write(html_msgs_for_data_cols[i])
+            text_file.write(html)
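For reference, the options that appear on only one side of this hunk (`--debug`, `--plot`/`--no-plot`, `-x`/`--xaxis`) combine with the existing `-f` flag roughly as follows; the result paths are hypothetical:

```bash
# Sketch: two-file comparison with the flags from this hunk
python3 compare-json-results.py \
  -f results_a/benchmark_results.json \
  -f results_b/benchmark_results.json \
  --debug --no-plot -x "qps"
```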
@@ -1,19 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import argparse
 import json
 import os
-import shlex
 from importlib import util
 from pathlib import Path
-from typing import Any
 
 import pandas as pd
 import psutil
-import regex as re
 from tabulate import tabulate
 
+results_folder = Path("results/")
+
 # latency results and the keys that will be printed into markdown
 latency_results = []
 latency_column_mapping = {
@@ -44,22 +42,13 @@ throughput_results_column_mapping = {
 serving_results = []
 serving_column_mapping = {
     "test_name": "Test name",
-    "model_id": "Model",
-    "dataset_name": "Dataset Name",
-    "input_len": "Input Len",
-    "output_len": "Output Len",
-    "tp_size": "TP Size",
-    "pp_size": "PP Size",
-    "dtype": "dtype",
     "gpu_type": "GPU",
     "completed": "# of req.",
-    "qps": "qps",
-    "max_concurrency": "# of max concurrency.",
     "request_throughput": "Tput (req/s)",
     "total_token_throughput": "Total Token Tput (tok/s)",
     "output_throughput": "Output Tput (tok/s)",
-    # "total_input_tokens": "Total input tokens",
-    # "total_output_tokens": "Total output tokens",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
@ -104,111 +93,15 @@ def get_size_with_unit(bytes, suffix="B"):
|
|||||||
bytes /= factor
|
bytes /= factor
|
||||||
|
|
||||||
|
|
||||||
def _coerce(val: str) -> Any:
|
|
||||||
"""Best-effort type coercion from string to Python types."""
|
|
||||||
low = val.lower()
|
|
||||||
if low == "null":
|
|
||||||
return None
|
|
||||||
if low == "true":
|
|
||||||
return True
|
|
||||||
if low == "false":
|
|
||||||
return False
|
|
||||||
# integers
|
|
||||||
if re.fullmatch(r"[+-]?\d+", val):
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
|
||||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
|
||||||
try:
|
|
||||||
return float(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return val
|
|
||||||
|
|
||||||
|
|
||||||
def parse_client_command(cmd: str) -> dict[str, Any]:
|
|
||||||
"""Parse the client_command shell string into {executable, script, args}."""
|
|
||||||
toks = shlex.split(cmd)
|
|
||||||
if len(toks) < 2:
|
|
||||||
raise ValueError("client_command must include an executable and a script")
|
|
||||||
executable, script = toks[0], toks[1]
|
|
||||||
args: dict[str, Any] = {}
|
|
||||||
|
|
||||||
i = 2
|
|
||||||
while i < len(toks):
|
|
||||||
t = toks[i]
|
|
||||||
if t.startswith("--"):
|
|
||||||
# --key=value or --key (value) or boolean flag
|
|
||||||
if "=" in t:
|
|
||||||
key, val = t.split("=", 1)
|
|
||||||
if key == "--metadata":
|
|
||||||
md = {}
|
|
||||||
if val:
|
|
||||||
if "=" in val:
|
|
||||||
k, v = val.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[val] = True
|
|
||||||
args[key] = md
|
|
||||||
else:
|
|
||||||
args[key] = _coerce(val)
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
key = t
|
|
||||||
|
|
||||||
# Special: consume metadata k=v pairs until next --flag
|
|
||||||
if key == "--metadata":
|
|
||||||
i += 1
|
|
||||||
md = {}
|
|
||||||
while i < len(toks) and not toks[i].startswith("--"):
|
|
||||||
pair = toks[i]
|
|
||||||
if "=" in pair:
|
|
||||||
k, v = pair.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[pair] = True
|
|
||||||
i += 1
|
|
||||||
args[key] = md
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Standard: check if next token is a value (not a flag)
|
|
||||||
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
|
|
||||||
args[key] = _coerce(toks[i + 1])
|
|
||||||
i += 2
|
|
||||||
else:
|
|
||||||
# lone flag -> True
|
|
||||||
args[key] = True
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
# unexpected positional; skip
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return {"executable": executable, "script": script, "args": args}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-r",
|
|
||||||
"--result",
|
|
||||||
type=str,
|
|
||||||
default="results",
|
|
||||||
help="Folder name for benchmark output results.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
results_folder = Path(args.result)
|
|
||||||
if not results_folder.exists():
|
|
||||||
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `vllm bench serve` command
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
@@ -216,50 +109,18 @@
             except OSError as e:
                 print(e)
                 continue
-            # Parse Server Command Arg
-            out: dict[str, Any] = {
-                "server_command": parse_client_command(command["server_command"])
-            }
-            parse_args = [
-                "--tensor-parallel-size",
-                "--pipeline-parallel-size",
-                "--dtype",
-            ]
-            col_mapping = ["tp_size", "pp_size", "dtype"]
-            for index, arg in enumerate(parse_args):
-                if arg in out["server_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["server_command"]["args"][arg]}
-                    )
-
-            # Parse Client Command Arg
-            out: dict[str, Any] = {
-                "client_command": parse_client_command(command["client_command"])
-            }
-            parse_args = [
-                "--dataset-name",
-                "--random-input-len",
-                "--random-output-len",
-                "--request-rate",
-            ]
-            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
 
-            for index, arg in enumerate(parse_args):
-                if arg in out["client_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["client_command"]["args"][arg]}
-                    )
-            # Add Server, Client command
             raw_result.update(command)
 
             # update the test name of this result
             raw_result.update({"test_name": test_file.stem})
 
             # add the result to raw_result
             serving_results.append(raw_result)
             continue
 
         elif "latency" in f.name:
-            # this result is generated via `vllm bench latency` command
+            # this result is generated via `benchmark_latency.py`
 
             # attach the benchmarking command to raw_result
             try:
@@ -287,7 +148,7 @@
             continue
 
         elif "throughput" in f.name:
-            # this result is generated via `vllm bench throughput` command
+            # this result is generated via `benchmark_throughput.py`
 
             # attach the benchmarking command to raw_result
             try:
@@ -343,10 +204,7 @@
         columns=latency_column_mapping
     )
     if not serving_results.empty:
-        valid_columns = [
-            col for col in serving_column_mapping if col in serving_results.columns
-        ]
-        serving_results = serving_results[valid_columns].rename(
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
             columns=serving_column_mapping
         )
     if not throughput_results.empty:
@ -368,7 +226,7 @@ if __name__ == "__main__":
|
|||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||||
# we want to turn it into "8xGPUTYPE"
|
# we want to turn it into "8xGPUTYPE"
|
||||||
df["GPU"] = df["GPU"].apply(
|
df["GPU"] = df["GPU"].apply(
|
||||||
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
|
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
||||||
)
|
)
|
||||||
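
The lambda above differs only in how it splits the multi-line GPU string: one side uses `splitlines()`, the other `split('\n')`. `splitlines()` ignores a trailing newline, so the GPU count is not inflated by an empty last entry; a backslash escape inside an f-string expression is also a syntax error before Python 3.12, which may be another reason for the rewrite. A quick check of the behaviour (the GPU name is illustrative):

    gpu_field = "NVIDIA H100 80GB HBM3\nNVIDIA H100 80GB HBM3\n"  # nvidia-smi style output with trailing newline

    lines = gpu_field.splitlines()
    label = f"{len(lines)}x{lines[0]}"
    assert label == "2xNVIDIA H100 80GB HBM3"

    naive = gpu_field.split("\n")
    assert len(naive) == 3  # the trailing newline adds an empty entry with split('\n')
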
|
|
||||||
# get markdown tables
|
# get markdown tables
|
||||||
@ -386,9 +244,7 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
# document the result
|
# document the result
|
||||||
md_file = "benchmark_results.md"
|
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||||
json_file = "benchmark_results.json"
|
|
||||||
with open(results_folder / md_file, "w") as f:
|
|
||||||
results = read_markdown(
|
results = read_markdown(
|
||||||
"../.buildkite/nightly-benchmarks/"
|
"../.buildkite/nightly-benchmarks/"
|
||||||
+ "performance-benchmarks-descriptions.md"
|
+ "performance-benchmarks-descriptions.md"
|
||||||
@ -403,7 +259,7 @@ if __name__ == "__main__":
|
|||||||
f.write(results)
|
f.write(results)
|
||||||
|
|
||||||
# document benchmarking results in json
|
# document benchmarking results in json
|
||||||
with open(results_folder / json_file, "w") as f:
|
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||||
results = (
|
results = (
|
||||||
latency_results.to_dict(orient="records")
|
latency_results.to_dict(orient="records")
|
||||||
+ throughput_results.to_dict(orient="records")
|
+ throughput_results.to_dict(orient="records")
|
||||||
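
For reference, the JSON artifact written above is simply the three per-suite tables concatenated as flat records; the markdown file additionally prepends the descriptions pulled in via `read_markdown`. A minimal sketch of the JSON side with made-up single-row tables:

    import json

    import pandas as pd

    latency_results = pd.DataFrame([{"test_name": "latency_llama8B_tp1", "avg_latency": 1.23}])
    throughput_results = pd.DataFrame([{"test_name": "throughput_llama8B_tp1", "tokens_per_s": 456.7}])
    serving_results = pd.DataFrame([{"test_name": "serving_llama8B_tp1_sharegpt", "qps": "inf"}])

    records = (latency_results.to_dict(orient="records")
               + throughput_results.to_dict(orient="records")
               + serving_results.to_dict(orient="records"))
    with open("benchmark_results.json", "w") as f:
        f.write(json.dumps(records))
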
|
@ -181,14 +181,18 @@ launch_vllm_server() {
|
|||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
else
|
else
|
||||||
echo "Key 'fp8' does not exist in common params."
|
echo "Key 'fp8' does not exist in common params."
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
fi
|
fi
|
||||||
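
The only change in this hunk is the spelling of the launch command: `vllm serve MODEL ...` and `python3 -m vllm.entrypoints.openai.api_server --model MODEL ...` both start vLLM's OpenAI-compatible API server with the same flags. A sketch of the two equivalent command strings (all values illustrative; the script substitutes them from the test-case JSON):

    model = "meta-llama/Llama-3.1-8B-Instruct"
    port = 8000
    tp = 1
    server_args = "--max-num-seqs 256"

    cli_form = f"vllm serve {model} -tp {tp} --port {port} {server_args}"
    module_form = (
        f"python3 -m vllm.entrypoints.openai.api_server --model {model} "
        f"-tp {tp} --port {port} {server_args}"
    )
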
|
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
|
|||||||
echo "Container: vllm"
|
echo "Container: vllm"
|
||||||
# move to a completely irrelevant directory, to avoid importing vllm from the current folder
|
# move to a completely irrelevant directory, to avoid importing vllm from the current folder
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||||
|
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -95,14 +95,12 @@ json2args() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
kill_gpu_processes() {
|
kill_gpu_processes() {
|
||||||
pkill -f '[p]ython'
|
pkill -f python
|
||||||
pkill -f '[p]ython3'
|
pkill -f python3
|
||||||
pkill -f '[t]ritonserver'
|
pkill -f tritonserver
|
||||||
pkill -f '[p]t_main_thread'
|
pkill -f pt_main_thread
|
||||||
pkill -f '[t]ext-generation'
|
pkill -f text-generation
|
||||||
pkill -f '[l]mdeploy'
|
pkill -f lmdeploy
|
||||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
|
||||||
pkill -f '[V]LLM'
|
|
||||||
|
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
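
The bracketed patterns passed to pkill above ('[p]ython', '[t]ritonserver', '[V]LLM', ...) are a common shell idiom: as a regex, '[p]ython' still matches real python/python3 processes, but it no longer matches any command line that carries the pattern text verbatim, such as the pkill invocation itself or a wrapper that echoes the command, so the cleanup cannot accidentally hit its own tooling. The regex side of the idiom can be checked directly:

    import re

    pattern = "[p]ython"  # character class matching only the literal "p"
    target = "python3 benchmark_serving.py --backend vllm"
    self_cmdline = f"pkill -f {pattern}"

    assert re.search(pattern, target)            # still matches real python processes
    assert not re.search(pattern, self_cmdline)  # does not match the line carrying the pattern

The extra '[V]LLM' pattern covers worker processes renamed with a VLLM prefix, as the accompanying comment and linked pull request note.
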
@ -127,7 +125,7 @@ ensure_installed() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `vllm bench serve` command
|
# run serving tests using `benchmark_serving.py`
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -227,7 +225,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||||
|
|
||||||
client_command="vllm bench serve \
|
client_command="python3 benchmark_serving.py \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -248,7 +246,7 @@ run_serving_tests() {
|
|||||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||||
|
|
||||||
client_command="vllm bench serve \
|
client_command="python3 benchmark_serving.py \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -267,13 +265,13 @@ run_serving_tests() {
|
|||||||
$client_args"
|
$client_args"
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
||||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||||
exit 1
|
exit 1
|
||||||
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
@ -304,7 +302,7 @@ run_serving_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_genai_perf_tests() {
|
run_genai_perf_tests() {
|
||||||
# run genai-perf tests
|
# run genai-perf tests
|
||||||
|
|
||||||
# $1: a json file specifying genai-perf test cases
|
# $1: a json file specifying genai-perf test cases
|
||||||
local genai_perf_test_file
|
local genai_perf_test_file
|
||||||
@ -313,14 +311,14 @@ run_genai_perf_tests() {
|
|||||||
# Iterate over genai-perf tests
|
# Iterate over genai-perf tests
|
||||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
echo "Skip test case $test_name."
|
echo "Skip test case $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# prepend the current serving engine to the test name
|
# prepend the current serving engine to the test name
|
||||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||||
|
|
||||||
@ -371,10 +369,10 @@ run_genai_perf_tests() {
|
|||||||
qps=$num_prompts
|
qps=$num_prompts
|
||||||
echo "now qps is $qps"
|
echo "now qps is $qps"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
new_test_name=$test_name"_qps_"$qps
|
||||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||||
|
|
||||||
if [[ "$backend" == *"vllm"* ]]; then
|
if [[ "$backend" == *"vllm"* ]]; then
|
||||||
backend="vllm"
|
backend="vllm"
|
||||||
fi
|
fi
|
||||||
@ -382,7 +380,7 @@ run_genai_perf_tests() {
|
|||||||
client_command="genai-perf profile \
|
client_command="genai-perf profile \
|
||||||
-m $model \
|
-m $model \
|
||||||
--service-kind openai \
|
--service-kind openai \
|
||||||
--backend "$backend" \
|
--backend vllm \
|
||||||
--endpoint-type chat \
|
--endpoint-type chat \
|
||||||
--streaming \
|
--streaming \
|
||||||
--url localhost:$port \
|
--url localhost:$port \
|
||||||
@ -415,7 +413,7 @@ prepare_dataset() {
|
|||||||
do
|
do
|
||||||
cat sonnet.txt >> sonnet_4x.txt
|
cat sonnet.txt >> sonnet_4x.txt
|
||||||
done
|
done
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
@ -33,7 +33,7 @@ check_gpus() {
|
|||||||
|
|
||||||
check_cpus() {
|
check_cpus() {
|
||||||
# check the number of CPUs and NUMA Node and GPU type.
|
# check the number of CPUs and NUMA Node and GPU type.
|
||||||
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
|
||||||
if [[ $numa_count -gt 0 ]]; then
|
if [[ $numa_count -gt 0 ]]; then
|
||||||
echo "NUMA found."
|
echo "NUMA found."
|
||||||
echo $numa_count
|
echo $numa_count
|
||||||
@ -126,8 +126,7 @@ kill_gpu_processes() {
|
|||||||
ps -aux
|
ps -aux
|
||||||
lsof -t -i:8000 | xargs -r kill -9
|
lsof -t -i:8000 | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
|
||||||
pgrep VLLM | xargs -r kill -9
|
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
@ -165,7 +164,7 @@ upload_to_buildkite() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_latency_tests() {
|
||||||
# run latency tests using `vllm bench latency` command
|
# run latency tests using `benchmark_latency.py`
|
||||||
# $1: a json file specifying latency test cases
|
# $1: a json file specifying latency test cases
|
||||||
|
|
||||||
local latency_test_file
|
local latency_test_file
|
||||||
@ -194,11 +193,9 @@ run_latency_tests() {
|
|||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||||
if [ "$ON_CPU" == "1" ]; then
|
if [ "$ON_CPU" == "1" ];then
|
||||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
|
if [[ $numa_count -lt $tp ]]; then
|
||||||
world_size=$(($tp*$pp))
|
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
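
The CPU branch above now sizes the resource check by the full world size (tensor parallel times pipeline parallel) instead of tensor parallelism alone, and it only skips the test when there is no REMOTE_HOST to spread the extra ranks across. A hedged Python rendering of the same decision:

    def should_skip_on_cpu(tp: int, pp: int, numa_count: int, remote_host: str | None) -> bool:
        # Skip when the local NUMA nodes cannot host the full world size and no remote host is configured.
        world_size = tp * pp
        return numa_count < world_size and not remote_host

    # tp=2, pp=3 needs 6 ranks; 4 NUMA nodes and no remote host -> skip.
    assert should_skip_on_cpu(tp=2, pp=3, numa_count=4, remote_host=None) is True
    assert should_skip_on_cpu(tp=2, pp=3, numa_count=4, remote_host="10.0.0.2") is False
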
@ -208,7 +205,7 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
latency_command=" $latency_envs vllm bench latency \
|
latency_command=" $latency_envs python3 benchmark_latency.py \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$latency_args"
|
$latency_args"
|
||||||
|
|
||||||
@ -234,7 +231,7 @@ run_latency_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_throughput_tests() {
|
run_throughput_tests() {
|
||||||
# run throughput tests using `vllm bench throughput`
|
# run throughput tests using `benchmark_throughput.py`
|
||||||
# $1: a json file specifying throughput test cases
|
# $1: a json file specifying throughput test cases
|
||||||
|
|
||||||
local throughput_test_file
|
local throughput_test_file
|
||||||
@ -263,11 +260,9 @@ run_throughput_tests() {
|
|||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||||
if [ "$ON_CPU" == "1" ]; then
|
if [ "$ON_CPU" == "1" ];then
|
||||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
|
if [[ $numa_count -lt $tp ]]; then
|
||||||
world_size=$(($tp*$pp))
|
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
@ -277,7 +272,7 @@ run_throughput_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
throughput_command=" $throughput_envs vllm bench throughput \
|
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$throughput_args"
|
$throughput_args"
|
||||||
|
|
||||||
@ -302,7 +297,7 @@ run_throughput_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `vllm bench serve` command
|
# run serving tests using `benchmark_serving.py`
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -333,21 +328,12 @@ run_serving_tests() {
|
|||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
echo "Running over qps list $qps_list"
|
echo "Running over qps list $qps_list"
|
||||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
|
||||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
|
||||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
|
||||||
max_concurrency_list="[$num_prompts]"
|
|
||||||
fi
|
|
||||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over max concurrency list $max_concurrency_list"
|
|
||||||
|
|
||||||
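
The new lines above read an optional max_concurrency_list from the test case and, when it is missing or null, fall back to a single-element list containing num_prompts, so older test files keep their previous single-run behaviour. The same fallback in Python, assuming the test case has already been parsed from JSON (the example case is illustrative):

    import json

    test_case = json.loads("""
    {
      "qps_list": ["inf"],
      "client_parameters": {"num_prompts": 200}
    }
    """)  # a test case without a max_concurrency_list

    max_concurrency_list = test_case.get("max_concurrency_list")
    if not max_concurrency_list:
        max_concurrency_list = [test_case["client_parameters"]["num_prompts"]]

    assert max_concurrency_list == [200]
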
# check if there is enough resources to run the test
|
# check if there is enough resources to run the test
|
||||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||||
if [ "$ON_CPU" == "1" ]; then
|
if [ "$ON_CPU" == "1" ];then
|
||||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
|
if [[ $numa_count -lt $tp ]]; then
|
||||||
world_size=$(($tp*$pp))
|
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
@ -365,7 +351,8 @@ run_serving_tests() {
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
server_command="$server_envs vllm serve \
|
server_command="$server_envs python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
$server_args"
|
$server_args"
|
||||||
|
|
||||||
# run the server
|
# run the server
|
||||||
@ -402,39 +389,35 @@ run_serving_tests() {
|
|||||||
echo "now qps is $qps"
|
echo "now qps is $qps"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# iterate over different max_concurrency
|
new_test_name=$test_name"_qps_"$qps
|
||||||
for max_concurrency in $max_concurrency_list; do
|
|
||||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
|
||||||
echo " new test name $new_test_name"
|
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
|
||||||
# on the benchmark dashboard
|
|
||||||
client_command="vllm bench serve \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
--max-concurrency $max_concurrency \
|
|
||||||
--metadata "tensor_parallel_size=$tp" \
|
|
||||||
$client_args $client_remote_args "
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
# pass the tensor parallel size to the client so that it can be displayed
|
||||||
echo "Client command: $client_command"
|
# on the benchmark dashboard
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--metadata "tensor_parallel_size=$tp" \
|
||||||
|
$client_args $client_remote_args "
|
||||||
|
|
||||||
bash -c "$client_command"
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
# record the benchmarking commands
|
bash -c "$client_command"
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
# record the benchmarking commands
|
||||||
--arg client "$client_command" \
|
jq_output=$(jq -n \
|
||||||
--arg gpu "$gpu_type" \
|
--arg server "$server_command" \
|
||||||
'{
|
--arg client "$client_command" \
|
||||||
server_command: $server,
|
--arg gpu "$gpu_type" \
|
||||||
client_command: $client,
|
'{
|
||||||
gpu_type: $gpu
|
server_command: $server,
|
||||||
}')
|
client_command: $client,
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
gpu_type: $gpu
|
||||||
|
}')
|
||||||
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
done
|
|
||||||
done
|
done
|
||||||
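
With the added inner loop, every (qps, max_concurrency) pair becomes its own benchmark run: the result file and the recorded-commands sidecar are both named after the pair. A small sketch of the naming scheme used above:

    test_name = "serving_llama8B_bf16_tp1_sharegpt"  # illustrative
    qps_list = ["inf"]
    max_concurrency_list = [12, 16, 24]

    run_names = [
        f"{test_name}_qps_{qps}_concurrency_{max_concurrency}"
        for qps in qps_list
        for max_concurrency in max_concurrency_list
    ]
    # Each entry maps to <name>.json (benchmark output) and <name>.commands (recorded commands).
    assert run_names[0] == "serving_llama8B_bf16_tp1_sharegpt_qps_inf_concurrency_12"
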
|
|
||||||
# clean up
|
# clean up
|
||||||
@ -454,12 +437,17 @@ main() {
|
|||||||
fi
|
fi
|
||||||
check_hf_token
|
check_hf_token
|
||||||
|
|
||||||
|
# Set to v1 to run v1 benchmark
|
||||||
|
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
fi
|
||||||
|
|
||||||
# dependencies
|
# dependencies
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
(which jq) || (apt-get update && apt-get -y install jq)
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
(which lsof) || (apt-get update && apt-get install -y lsof)
|
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||||
|
|
||||||
# get the current IP address, required by `vllm bench serve` command
|
# get the current IP address, required by benchmark_serving.py
|
||||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||||
# turn off the reporting of the status of each request, to clean up the terminal output
|
# turn off the reporting of the status of each request, to clean up the terminal output
|
||||||
export VLLM_LOGGING_LEVEL="WARNING"
|
export VLLM_LOGGING_LEVEL="WARNING"
|
||||||
|
@ -11,7 +11,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"num_iters_warmup": 5,
|
"num_iters_warmup": 5,
|
||||||
@ -20,7 +20,7 @@
|
|||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"num_iters_warmup": 5,
|
"num_iters_warmup": 5,
|
||||||
|
@ -35,7 +35,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
@ -88,7 +90,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
@ -141,7 +145,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
@ -191,7 +197,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
@ -243,7 +251,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
@ -295,7 +305,9 @@
|
|||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"gpu_memory_utilization": 0.9,
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
"max_num_seqs": 512,
|
"max_num_seqs": 512,
|
||||||
"dtype": "bfloat16"
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
|
@ -1,610 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
@ -1,820 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
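The harness that consumes these JSON entries is not part of this diff, so the following is only a rough sketch of what one entry (serving_llama8B_bf16_tp2pp3_random_128_128) expands to, assuming the usual convention that server_parameters become vllm serve flags and client_parameters become serving-benchmark flags, with underscores mapped to dashes:

    # server side (sketch, not the literal harness command)
    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_SGL_KERNEL=1 \
      vllm serve meta-llama/Llama-3.1-8B-Instruct \
        --tensor-parallel-size 2 --pipeline-parallel-size 3 \
        --dtype bfloat16 --distributed-executor-backend mp --block-size 128 \
        --trust-remote-code --enable-chunked-prefill --enforce-eager \
        --disable-log-stats --max-num-batched-tokens 2048 --max-num-seqs 256 \
        --load-format dummy

    # client side (sketch; vllm bench serve, or the benchmark_serving.py script it wraps)
    vllm bench serve --backend vllm --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random --random-input-len 128 --random-output-len 128 \
        --ignore-eos --num-prompts 1000 --max-concurrency 64

The qps_list and max_concurrency_list values are then swept as separate client runs against the same server.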
@ -2,112 +2,104 @@
|
|||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
@ -115,34 +107,32 @@
|
|||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"enable_chunked_prefill": "",
|
"enable_chunked_prefill": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 1024,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 100,
|
||||||
"num_prompts": 100
|
"num_prompts": 100
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"pipeline_parallel_size": 6,
|
"pipeline_parallel_size": 6,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
@ -150,18 +140,18 @@
|
|||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"enable_chunked_prefill": "",
|
"enable_chunked_prefill": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 1024,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 100,
|
||||||
"num_prompts": 100
|
"num_prompts": 100
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
"swap_space": 16,
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -25,6 +26,7 @@
|
|||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
"swap_space": 16,
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -43,6 +45,7 @@
|
|||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
"swap_space": 16,
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -57,7 +60,8 @@
|
|||||||
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
||||||
"qps_list": [2],
|
"qps_list": [2],
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
|
"disable_log_requests": "",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
"swap_space": 16,
|
||||||
"speculative_config": {
|
"speculative_config": {
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
@ -21,7 +21,7 @@
|
|||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
.buildkite/pyproject.toml (new file, 46 lines)
@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
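Since ruff resolves its configuration from the nearest pyproject.toml above each file, scripts under .buildkite/ pick up this local 88-column config automatically. A minimal sketch of exercising it locally (assumes ruff is installed, e.g. via pip install ruff):

    # lint and format only the .buildkite tree with the local config
    ruff check .buildkite
    ruff format .buildkite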
@ -1,36 +1,5 @@
|
|||||||
steps:
|
steps:
|
||||||
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
|
||||||
- label: "Build arm64 wheel - CUDA 12.9"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cuda-12-9
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
# aarch64 build.
|
|
||||||
- label: "Build arm64 CPU wheel"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cpu
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.8"
|
- label: "Build wheel - CUDA 12.8"
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-cuda-12-8
|
id: build-wheel-cuda-12-8
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
@ -43,7 +12,6 @@ steps:
|
|||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.6"
|
- label: "Build wheel - CUDA 12.6"
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-cuda-12-6
|
id: build-wheel-cuda-12-6
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
@ -55,61 +23,44 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# x86 + CUDA builds
|
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
|
||||||
- label: "Build wheel - CUDA 12.9"
|
# However, this block can be uncommented to save some compute hours.
|
||||||
depends_on: ~
|
# - block: "Build CUDA 11.8 wheel"
|
||||||
id: build-wheel-cuda-12-9
|
# key: block-build-cu118-wheel
|
||||||
|
|
||||||
|
- label: "Build wheel - CUDA 11.8"
|
||||||
|
# depends_on: block-build-cu118-wheel
|
||||||
|
id: build-wheel-cuda-11-8
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build release image (x86)"
|
- block: "Build release image"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-x86
|
key: block-release-image-build
|
||||||
|
|
||||||
|
- label: "Build release image"
|
||||||
|
depends_on: block-release-image-build
|
||||||
|
id: build-release-image
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
|
||||||
- label: "Build release image (arm64)"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-release-image-arm64
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
|
||||||
- label: "Create multi-arch manifest"
|
|
||||||
depends_on:
|
|
||||||
- build-release-image-x86
|
|
||||||
- build-release-image-arm64
|
|
||||||
id: create-multi-arch-manifest
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
|
|
||||||
- label: "Annotate release workflow"
|
- label: "Annotate release workflow"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- build-release-image
|
||||||
- build-wheel-cuda-12-8
|
- build-wheel-cuda-12-8
|
||||||
|
- build-wheel-cuda-12-6
|
||||||
|
- build-wheel-cuda-11-8
|
||||||
id: annotate-release-workflow
|
id: annotate-release-workflow
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
@ -156,46 +107,18 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build arm64 CPU release image"
|
- block: "Build Neuron release image"
|
||||||
key: block-arm64-cpu-release-image-build
|
key: block-neuron-release-image-build
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
|
|
||||||
- label: "Build and publish arm64 CPU release image"
|
- label: "Build and publish Neuron release image"
|
||||||
depends_on: block-arm64-cpu-release-image-build
|
depends_on: block-neuron-release-image-build
|
||||||
agents:
|
agents:
|
||||||
queue: arm64_cpu_queue_postmerge
|
queue: neuron-postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest
|
|
||||||
if: build.env("NIGHTLY") == "1"
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
|
||||||
plugins:
|
|
||||||
- docker-login#v3.0.0:
|
|
||||||
username: vllmbot
|
|
||||||
password-env: DOCKERHUB_TOKEN
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
||||||
|
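A hypothetical spot check, not part of the pipeline above: once the combined manifest for a commit has been pushed, it can be inspected to confirm that both the x86_64 and aarch64 images are referenced (assumes docker and jq are available):

    docker manifest inspect public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT \
      | jq '.manifests[].platform'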
@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
|||||||
To download the wheel:
|
To download the wheel:
|
||||||
\`\`\`
|
\`\`\`
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
To download and upload the image:
|
To download and upload the image:
|
||||||
|
|
||||||
\`\`\`
|
\`\`\`
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||||
|
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
docker push vllm/vllm-openai:latest
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
docker push vllm/vllm-openai:latest-x86_64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
docker push vllm/vllm-openai:latest-aarch64
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|
||||||
|
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
|
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
|
|
||||||
docker manifest push vllm/vllm-openai:latest
|
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
|
||||||
\`\`\`
|
\`\`\`
|
||||||
EOF
|
EOF
|
@ -1,120 +0,0 @@
(removed file: the nightly DockerHub tag cleanup script invoked above as .buildkite/scripts/cleanup-nightly-builds.sh)
#!/bin/bash

set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
fi

if [ -z "$DOCKERHUB_USERNAME" ]; then
    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
    exit 1
fi

# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
    -H "Content-Type: application/json" \
    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
    "https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x

if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
    echo "Error: Failed to get DockerHub bearer token"
    exit 1
fi

# Function to get all tags from DockerHub
get_all_tags() {
    local page=1
    local all_tags=""

    while true; do
        set +x
        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
        set -x

        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')

        if [ -z "$tags" ]; then
            break
        fi

        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}

delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"

    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    set +x
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
    set -x

    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
}

# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)

if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
fi

# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"

# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))

if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
fi

echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"

# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))

if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
fi

# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
done <<< "$tags_to_delete_list"

echo "Cleanup completed successfully"
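A sketch of how this cleanup script is driven, mirroring the nightly pipeline step and Docker Hub login settings shown earlier; the token value is a placeholder, and curl plus jq must be on PATH:

    export DOCKERHUB_USERNAME=vllmbot
    export DOCKERHUB_TOKEN=<dockerhub-access-token>   # placeholder, never commit a real token
    bash .buildkite/scripts/cleanup-nightly-builds.sh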
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
|||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||||
|
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
fi
|
fi
|
||||||
@ -117,6 +121,7 @@ fi
|
|||||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||||
commands="${commands} \
|
commands="${commands} \
|
||||||
--ignore=kernels/quantization/test_int8_quant.py \
|
--ignore=kernels/quantization/test_int8_quant.py \
|
||||||
|
--ignore=kernels/quantization/test_aqlm.py \
|
||||||
--ignore=kernels/quantization/test_machete_mm.py \
|
--ignore=kernels/quantization/test_machete_mm.py \
|
||||||
--ignore=kernels/quantization/test_block_fp8.py \
|
--ignore=kernels/quantization/test_block_fp8.py \
|
||||||
--ignore=kernels/quantization/test_block_int8.py \
|
--ignore=kernels/quantization/test_block_int8.py \
|
||||||
@ -160,9 +165,16 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
|||||||
--ignore=entrypoints/llm/test_chat.py \
|
--ignore=entrypoints/llm/test_chat.py \
|
||||||
--ignore=entrypoints/llm/test_accuracy.py \
|
--ignore=entrypoints/llm/test_accuracy.py \
|
||||||
--ignore=entrypoints/llm/test_init.py \
|
--ignore=entrypoints/llm/test_init.py \
|
||||||
|
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
#Obsolete currently
|
||||||
|
##ignore certain Entrypoints/llm tests
|
||||||
|
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||||
|
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||||
|
#fi
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
# --ignore=entrypoints/openai/test_embedding.py \
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||||
|
@ -25,28 +25,25 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -xve
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -evx
|
set -e
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||||
pip install sentence-transformers datamodel_code_generator
|
pip install sentence-transformers datamodel_code_generator
|
||||||
|
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||||
# Note: disable Bart until supports V1
|
|
||||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# All CPU tests are expected to finish in under 40 minutes.
|
# All CPU tests are expected to finish in under 40 minutes.
|
||||||
|
|
||||||
export container_id
|
export container_id
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 120m bash -c cpu_tests
|
timeout 40m bash -c cpu_tests
|
||||||
|
|
||||||
|
@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
|
|||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
set -e;
|
set -e;
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
|
|||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
@ -46,74 +46,57 @@ function cpu_tests() {
|
|||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run kernel tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
# Note: disable until supports V1
|
# Note: disable until supports V1
|
||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
# Note: disable Bart until supports V1
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
pytest -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
pytest -v -s tests/models/multimodal/generation \
|
||||||
|
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||||
-m cpu_model"
|
-m cpu_model"
|
||||||
|
|
||||||
# Run compressed-tensor test
|
# Run compressed-tensor test
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -x -s -v \
|
pytest -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||||
|
|
||||||
# Note: disable it until supports V1
|
# Note: disable it until supports V1
|
||||||
# Run AWQ test
|
# Run AWQ test
|
||||||
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
# set -e
|
# set -e
|
||||||
# VLLM_USE_V1=0 pytest -x -s -v \
|
# VLLM_USE_V1=0 pytest -s -v \
|
||||||
# tests/quantization/test_ipex_quant.py"
|
# tests/quantization/test_ipex_quant.py"
|
||||||
|
|
||||||
|
# online serving
|
||||||
|
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||||
|
set -e
|
||||||
|
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions'
|
||||||
|
|
||||||
# Run multi-lora tests
|
# Run multi-lora tests
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -x -s -v \
|
pytest -s -v \
|
||||||
tests/lora/test_qwen2vl.py"
|
tests/lora/test_qwen2vl.py"
|
||||||
|
|
||||||
# online serving: tp+pp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
|
|
||||||
# online serving: tp+dp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# All CPU tests are expected to finish in under 40 minutes.
|
# All CPU tests are expected to finish in under 40 minutes.
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||||
|
@ -16,7 +16,8 @@ DOCKER_BUILDKIT=1 docker build . \
|
|||||||
--build-arg max_jobs=66 \
|
--build-arg max_jobs=66 \
|
||||||
--build-arg nvcc_threads=2 \
|
--build-arg nvcc_threads=2 \
|
||||||
--build-arg RUN_WHEEL_CHECK=false \
|
--build-arg RUN_WHEEL_CHECK=false \
|
||||||
--build-arg torch_cuda_arch_list="9.0+PTX"
|
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||||
|
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { docker rm -f gh200-test || true; }
|
remove_docker_container() { docker rm -f gh200-test || true; }
|
||||||
|
64
.buildkite/scripts/hardware_ci/run-neuron-test.sh
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script builds the Neuron docker image and runs the API server inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -e
|
||||||
|
set -v
|
||||||
|
|
||||||
|
image_name="neuron/vllm-ci"
|
||||||
|
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
|
|
||||||
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
|
mkdir -p "${HF_CACHE}"
|
||||||
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
||||||
|
|
||||||
|
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||||
|
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||||
|
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
||||||
|
|
||||||
|
# Prune old images and containers to save disk space, but only once a day,
|
||||||
|
# as tracked by a timestamp file in /tmp.
|
||||||
|
if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||||
|
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||||
|
current_time=$(date +%s)
|
||||||
|
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune -f
|
||||||
|
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||||
|
fi
|
||||||
|
|
||||||
|
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() {
|
||||||
|
docker image rm -f "${image_name}" || true;
|
||||||
|
}
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Run the image
|
||||||
|
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||||
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||||
|
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
|
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
|
--name "${container_name}" \
|
||||||
|
${image_name} \
|
||||||
|
/bin/bash -c "
|
||||||
|
set -e; # Exit on first error
|
||||||
|
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||||
|
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||||
|
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||||
|
echo \"Running test file: \$f\";
|
||||||
|
python3 -m pytest \$f -v --capture=tee-sys;
|
||||||
|
done
|
||||||
|
"
|
@ -1,191 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Base ubuntu image with basic ascend development libraries and python installed
|
|
||||||
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
|
|
||||||
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
|
|
||||||
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
|
|
||||||
VLLM_ASCEND_TMP_DIR=
|
|
||||||
# Get the test run configuration file from the vllm-ascend repository
|
|
||||||
fetch_vllm_test_cfg() {
|
|
||||||
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
|
|
||||||
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
|
|
||||||
cleanup() {
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
|
|
||||||
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If the file already exists locally, just overwrite it
|
|
||||||
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
|
|
||||||
|
|
||||||
# The trap set above will be overwritten later, and by the time execution reaches this point it has already
|
|
||||||
# handled cleanup on abnormal exit, so the temporary resources are deleted manually here.
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
trap - EXIT
|
|
||||||
}
|
|
||||||
|
|
||||||
# Downloads test run configuration file from a remote URL.
|
|
||||||
# Loads the configuration into the current script environment.
|
|
||||||
get_config() {
|
|
||||||
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
|
|
||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# get test running configuration.
|
|
||||||
fetch_vllm_test_cfg
|
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
|
|
||||||
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
|
|
||||||
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|
||||||
echo "agent_idx: ${agent_idx}"
|
|
||||||
builder_name="cachebuilder${agent_idx}"
|
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
|
||||||
mkdir -p ${builder_cache_dir}
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
|
||||||
FROM ${BASE_IMAGE_NAME}
|
|
||||||
|
|
||||||
# Define environments
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
|
|
||||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
|
||||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
|
||||||
apt-get update -y && \
|
|
||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
|
||||||
rm -rf /var/cache/apt/* && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install pytest here so that this docker build cache layer stays valid across builds
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install pytest>=6.0 modelscope
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements/common.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install vLLM
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
|
||||||
python3 -m pip uninstall -y triton
|
|
||||||
|
|
||||||
# Install vllm-ascend
|
|
||||||
WORKDIR /workspace
|
|
||||||
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
|
|
||||||
ARG VLLM_ASCEND_TAG=main
|
|
||||||
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
|
|
||||||
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
|
|
||||||
|
|
||||||
# Install vllm-ascend dependencies in advance. Effect: as long as its requirements.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r /workspace/vllm-ascend/requirements.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
ENV VLLM_USE_MODELSCOPE=True
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm-ascend
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
EOF
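Note that the -f - flag passed to docker build above makes it read the Dockerfile from stdin, so everything between cat <<EOF and this EOF marker is the Dockerfile being built, not part of the surrounding shell script.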
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
|
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
|
||||||
parse_and_gen_devices() {
|
|
||||||
local input="$1"
|
|
||||||
local index cards_num
|
|
||||||
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
|
|
||||||
index="${BASH_REMATCH[1]}"
|
|
||||||
cards_num="${BASH_REMATCH[2]}"
|
|
||||||
else
|
|
||||||
echo "parse error" >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
|
||||||
while (( i < cards_num )); do
|
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
|
||||||
((i++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Print the devices string so the caller can capture it
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
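Working through the arithmetic above: under the stated naming convention, parse_and_gen_devices "atlas-a2-001-2-2cards" yields index=2 and cards_num=2, so dev_idx takes the values 2 and 3 and the function prints "--device /dev/davinci2 --device /dev/davinci3".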
|
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
|
||||||
# the hardware plugin vllm-ascend.
|
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
|
||||||
mkdir -p ${model_cache_dir}
|
|
||||||
docker run \
|
|
||||||
${devices} \
|
|
||||||
--device /dev/davinci_manager \
|
|
||||||
--device /dev/devmm_svm \
|
|
||||||
--device /dev/hisi_hdc \
|
|
||||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
|
||||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
|
||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
bash -c '
|
|
||||||
set -e
|
|
||||||
pytest -v -s tests/e2e/vllm_interface/
|
|
||||||
'
|
|
@ -1,166 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f tpu-test || true;
|
|
||||||
}
|
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
cleanup_docker() {
|
|
||||||
# Get Docker's root directory
|
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
|
||||||
if [ -z "$docker_root" ]; then
|
|
||||||
echo "Failed to determine Docker root directory."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Docker root directory: $docker_root"
|
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
|
||||||
echo "Docker images and volumes cleanup completed."
|
|
||||||
else
|
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
cleanup_docker
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c '
|
|
||||||
set -e # Exit immediately if a command exits with a non-zero status.
|
|
||||||
set -u # Treat unset variables as an error.
|
|
||||||
|
|
||||||
echo "--- Starting script inside Docker container ---"
|
|
||||||
|
|
||||||
# Create results directory
|
|
||||||
RESULTS_DIR=$(mktemp -d)
|
|
||||||
# If mktemp fails, set -e will cause the script to exit.
|
|
||||||
echo "Results will be stored in: $RESULTS_DIR"
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
echo "--- Installing Python dependencies ---"
|
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
|
||||||
echo "--- Python dependencies installed ---"
|
|
||||||
|
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
|
||||||
export VLLM_XLA_CACHE_PATH=
|
|
||||||
|
|
||||||
echo "--- Hardware Information ---"
|
|
||||||
# tpu-info
|
|
||||||
echo "--- Starting Tests ---"
|
|
||||||
set +e
|
|
||||||
overall_script_exit_code=0
|
|
||||||
|
|
||||||
# --- Test Definitions ---
|
|
||||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
|
||||||
run_test() {
|
|
||||||
local test_num=$1
|
|
||||||
local test_name=$2
|
|
||||||
local test_command=$3
|
|
||||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
|
||||||
local actual_exit_code
|
|
||||||
|
|
||||||
echo "--- TEST_$test_num: Running $test_name ---"
|
|
||||||
|
|
||||||
# Execute the test command.
|
|
||||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
|
||||||
actual_exit_code=$?
|
|
||||||
|
|
||||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
|
||||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
|
||||||
|
|
||||||
if [ "$actual_exit_code" -ne 0 ]; then
|
|
||||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
|
||||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
|
||||||
if [ -f "$log_file" ]; then
|
|
||||||
cat "$log_file" >&2
|
|
||||||
else
|
|
||||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
|
||||||
fi
|
|
||||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
|
||||||
return "$actual_exit_code" # Return the failure code
|
|
||||||
else
|
|
||||||
echo "TEST_$test_num ($test_name) PASSED."
|
|
||||||
return 0 # Return success
|
|
||||||
fi
|
|
||||||
}
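The > >(tee -a "$log_file") and 2> >(tee -a "$log_file" >&2) redirections above are bash process substitutions: stdout and stderr each flow through tee, so output reaches both the console and the per-test log, while $? still reflects the exit status of the eval'd test command rather than of tee.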
|
|
||||||
|
|
||||||
# Helper function to call run_test and update the overall script exit code
|
|
||||||
run_and_track_test() {
|
|
||||||
local test_num_arg="$1"
|
|
||||||
local test_name_arg="$2"
|
|
||||||
local test_command_arg="$3"
|
|
||||||
|
|
||||||
# Run the test
|
|
||||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
|
||||||
local test_specific_exit_code=$?
|
|
||||||
|
|
||||||
# If the test failed, set the overall script exit code to 1
|
|
||||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
|
||||||
# No need for extra echo here, run_test already logged the failure.
|
|
||||||
overall_script_exit_code=1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# --- Actual Test Execution ---
|
|
||||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
|
||||||
run_and_track_test 2 "test_moe_pallas.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
|
||||||
run_and_track_test 3 "test_lora.py" \
|
|
||||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
|
||||||
run_and_track_test 4 "test_tpu_qkv_linear.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
|
||||||
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
|
||||||
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
|
||||||
run_and_track_test 7 "test_tpu_int8.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
|
|
||||||
|
|
||||||
# After all tests have been attempted, exit with the overall status.
|
|
||||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
|
||||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
|
||||||
else
|
|
||||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
|
||||||
fi
|
|
||||||
exit "$overall_script_exit_code"
|
|
||||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
|
||||||
|
|
||||||
# Capture the exit code of the docker run command
|
|
||||||
DOCKER_RUN_EXIT_CODE=$?
|
|
||||||
|
|
||||||
# The trap will run for cleanup.
|
|
||||||
# Exit the main script with the Docker run command's exit code.
|
|
||||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
|
||||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
|
||||||
exit "$DOCKER_RUN_EXIT_CODE"
|
|
||||||
else
|
|
||||||
echo "Docker run command completed successfully."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
@ -5,6 +5,7 @@ set -xu
|
|||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f tpu-test || true;
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
}
|
}
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
@ -61,12 +62,12 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
export VLLM_XLA_CACHE_PATH=
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
echo "--- Hardware Information ---"
|
echo "--- Hardware Information ---"
|
||||||
# tpu-info
|
# tpu-info
|
||||||
@ -148,6 +149,18 @@ run_and_track_test 9 "test_multimodal.py" \
|
|||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
run_and_track_test 10 "test_pallas.py" \
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
|
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||||
|
run_and_track_test 12 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 13 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||||
|
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||||
|
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||||
|
|
||||||
# After all tests have been attempted, exit with the overall status.
|
# After all tests have been attempted, exit with the overall status.
|
||||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
@ -23,26 +23,12 @@ docker run \
|
|||||||
--device /dev/dri \
|
--device /dev/dri \
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||||
--entrypoint="" \
|
--entrypoint="" \
|
||||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
|
||||||
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
|
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
bash -c '
|
sh -c '
|
||||||
set -e
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
echo $ZE_AFFINITY_MASK
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
pip install tblib==3.1.0
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
|
||||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core
|
||||||
pytest -v -s v1/engine
|
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
|
||||||
pytest -v -s v1/structured_output
|
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
|
||||||
pytest -v -s v1/test_serial_utils.py
|
|
||||||
'
|
'
|
||||||
|
@ -11,20 +11,20 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
|||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
# run python-based benchmarks and upload the result to buildkite
|
# run python-based benchmarks and upload the result to buildkite
|
||||||
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||||
bench_latency_exit_code=$?
|
bench_latency_exit_code=$?
|
||||||
|
|
||||||
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
vllm serve meta-llama/Llama-2-7b-chat-hf &
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
# wait for server to start, timeout after 600 seconds
|
# wait for server to start, timeout after 600 seconds
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
vllm bench serve \
|
python3 benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name sharegpt \
|
--dataset-name sharegpt \
|
||||||
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
@ -1,59 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# Setup script for Prime-RL integration tests
|
|
||||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
|
||||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
|
||||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
|
||||||
|
|
||||||
echo "Setting up Prime-RL integration test environment..."
|
|
||||||
|
|
||||||
# Clean up any existing Prime-RL directory
|
|
||||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
|
||||||
echo "Removing existing Prime-RL directory..."
|
|
||||||
rm -rf "${PRIME_RL_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install UV if not available
|
|
||||||
if ! command -v uv &> /dev/null; then
|
|
||||||
echo "Installing UV package manager..."
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
|
||||||
PRIME_RL_BRANCH="integ-vllm-main"
|
|
||||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
|
||||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
|
||||||
cd "${PRIME_RL_DIR}"
|
|
||||||
|
|
||||||
echo "Setting up UV project environment..."
|
|
||||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
|
||||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
|
||||||
|
|
||||||
# Remove vllm pin from pyproject.toml
|
|
||||||
echo "Removing vllm pin from pyproject.toml..."
|
|
||||||
sed -i '/vllm==/d' pyproject.toml
|
|
||||||
|
|
||||||
# Sync Prime-RL dependencies
|
|
||||||
echo "Installing Prime-RL dependencies..."
|
|
||||||
uv sync --inexact && uv sync --inexact --all-extras
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
echo "Verifying installations..."
|
|
||||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
|
||||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
|
||||||
|
|
||||||
echo "Prime-RL integration test environment setup complete!"
|
|
||||||
|
|
||||||
echo "Running Prime-RL integration tests..."
|
|
||||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
|
||||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
|
||||||
|
|
||||||
echo "Prime-RL integration tests completed!"
|
|
@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
|
|||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
docker image prune -f
|
docker image prune -f
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
docker volume prune -f && docker system prune --force --filter "until=24h" --all
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
echo "Docker images and volumes cleanup completed."
|
echo "Docker images and volumes cleanup completed."
|
||||||
else
|
else
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# Environment config
|
# Environment config
|
||||||
TEST_NAME=llama8b
|
TEST_NAME=llama8b
|
||||||
CONTAINER_NAME=tpu-test
|
CONTAINER_NAME=vllm-tpu
|
||||||
|
|
||||||
# vllm config
|
# vllm config
|
||||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
@ -12,6 +12,8 @@ source /etc/environment
|
|||||||
source $ENV_FILE
|
source $ENV_FILE
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
docker rm -f $CONTAINER_NAME || true;
|
docker rm -f $CONTAINER_NAME || true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# Environment config
|
# Environment config
|
||||||
TEST_NAME=llama8bw8a8
|
TEST_NAME=llama8bw8a8
|
||||||
CONTAINER_NAME=tpu-test
|
CONTAINER_NAME=vllm-tpu
|
||||||
|
|
||||||
# vllm config
|
# vllm config
|
||||||
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
|
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
|
||||||
@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
|
|||||||
TENSOR_PARALLEL_SIZE=1
|
TENSOR_PARALLEL_SIZE=1
|
||||||
MAX_MODEL_LEN=2048
|
MAX_MODEL_LEN=2048
|
||||||
DOWNLOAD_DIR=/mnt/disks/persist
|
DOWNLOAD_DIR=/mnt/disks/persist
|
||||||
EXPECTED_THROUGHPUT=8.7
|
EXPECTED_THROUGHPUT=10.0
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=128
|
OUTPUT_LEN=128
|
||||||
|
@ -42,8 +42,9 @@ echo "lanching vllm..."
|
|||||||
echo "logging to $VLLM_LOG"
|
echo "logging to $VLLM_LOG"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vllm serve $MODEL \
|
VLLM_USE_V1=1 vllm serve $MODEL \
|
||||||
--seed 42 \
|
--seed 42 \
|
||||||
|
--disable-log-requests \
|
||||||
--max-num-seqs $MAX_NUM_SEQS \
|
--max-num-seqs $MAX_NUM_SEQS \
|
||||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
||||||
@ -76,7 +77,7 @@ done
|
|||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
echo "logging to $BM_LOG"
|
echo "logging to $BM_LOG"
|
||||||
echo
|
echo
|
||||||
vllm bench serve \
|
python benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
|
@ -14,19 +14,8 @@ fi
|
|||||||
# Get the single wheel file
|
# Get the single wheel file
|
||||||
wheel="${wheel_files[0]}"
|
wheel="${wheel_files[0]}"
|
||||||
|
|
||||||
# Detect architecture and rename 'linux' to appropriate manylinux version
|
# Rename 'linux' to 'manylinux1' in the wheel filename
|
||||||
arch=$(uname -m)
|
new_wheel="${wheel/linux/manylinux1}"
|
||||||
if [[ $arch == "x86_64" ]]; then
|
|
||||||
manylinux_version="manylinux1"
|
|
||||||
elif [[ $arch == "aarch64" ]]; then
|
|
||||||
manylinux_version="manylinux2014"
|
|
||||||
else
|
|
||||||
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
|
|
||||||
manylinux_version="manylinux1"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
|
||||||
new_wheel="${wheel/linux/$manylinux_version}"
|
|
||||||
mv -- "$wheel" "$new_wheel"
|
mv -- "$wheel" "$new_wheel"
|
||||||
wheel="$new_wheel"
|
wheel="$new_wheel"
|
||||||
|
|
||||||
@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
|||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
|
||||||
if [[ $normal_wheel == *"cu126"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu118 wheels"
|
||||||
|
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
# if $normal_wheel matches cu126, do not upload the index.html
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
echo "Skipping index files for cu126 wheels"
|
echo "Skipping index files for cu126 wheels"
|
||||||
elif [[ $normal_wheel == *"cu128"* ]]; then
|
|
||||||
# if $normal_wheel matches cu128, do not upload the index.html
|
|
||||||
echo "Skipping index files for cu128 wheels"
|
|
||||||
else
|
else
|
||||||
# only upload index.html for cu129 wheels (default wheels) as it
|
# only upload index.html for cu128 wheels (default wheels)
|
||||||
# is available on both x86 and arm64
|
|
||||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||||
fi
|
fi
|
||||||
@ -75,15 +63,14 @@ fi
|
|||||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||||
|
|
||||||
if [[ $normal_wheel == *"cu126"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu118 wheels"
|
||||||
|
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
# if $normal_wheel matches cu126, do not upload the index.html
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
echo "Skipping index files for cu126 wheels"
|
echo "Skipping index files for cu126 wheels"
|
||||||
elif [[ $normal_wheel == *"cu128"* ]]; then
|
|
||||||
# if $normal_wheel matches cu128, do not upload the index.html
|
|
||||||
echo "Skipping index files for cu128 wheels"
|
|
||||||
else
|
else
|
||||||
# only upload index.html for cu129 wheels (default wheels) as it
|
# only upload index.html for cu128 wheels (default wheels)
|
||||||
# is available on both x86 and arm64
|
|
||||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
47
.coveragerc
@ -1,47 +0,0 @@
|
|||||||
[run]
|
|
||||||
# Track the installed vllm package (this is what actually gets imported during tests)
|
|
||||||
# Use wildcard pattern to match the installed location
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
omit =
|
|
||||||
*/tests/*
|
|
||||||
*/test_*
|
|
||||||
*/__pycache__/*
|
|
||||||
*/build/*
|
|
||||||
*/dist/*
|
|
||||||
*/vllm.egg-info/*
|
|
||||||
*/third_party/*
|
|
||||||
*/examples/*
|
|
||||||
*/benchmarks/*
|
|
||||||
*/docs/*
|
|
||||||
|
|
||||||
[paths]
|
|
||||||
# Map all possible vllm locations to a canonical "vllm" path
|
|
||||||
# This ensures coverage.combine properly merges data from different test runs
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
/vllm-workspace/src/vllm
|
|
||||||
/vllm-workspace/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
|
|
||||||
[report]
|
|
||||||
exclude_lines =
|
|
||||||
pragma: no cover
|
|
||||||
def __repr__
|
|
||||||
if self.debug:
|
|
||||||
if settings.DEBUG
|
|
||||||
raise AssertionError
|
|
||||||
raise NotImplementedError
|
|
||||||
if 0:
|
|
||||||
if __name__ == .__main__.:
|
|
||||||
class .*\bProtocol\):
|
|
||||||
@(abc\.)?abstractmethod
|
|
||||||
|
|
||||||
[html]
|
|
||||||
directory = htmlcov
|
|
||||||
|
|
||||||
[xml]
|
|
||||||
output = coverage.xml
|
|
@ -1,4 +0,0 @@
|
|||||||
# Migrate from `yapf` & `isort` to `ruff`
|
|
||||||
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
|
||||||
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
|
||||||
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
|
24
.github/.bc-linter.yml
vendored
@ -1,24 +0,0 @@
|
|||||||
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
|
|
||||||
version: 1
|
|
||||||
paths:
|
|
||||||
# We temporarily disable globally, and will only enable with `annotations.include`
|
|
||||||
# include:
|
|
||||||
# - "vllm/v1/attetion/*.py"
|
|
||||||
# - "vllm/v1/core/*.py"
|
|
||||||
exclude:
|
|
||||||
- "**/*.py"
|
|
||||||
|
|
||||||
scan:
|
|
||||||
functions: true # check free functions and methods
|
|
||||||
classes: true # check classes/dataclasses
|
|
||||||
public_only: true # ignore names starting with "_" at any level
|
|
||||||
|
|
||||||
annotations:
|
|
||||||
include: # decorators that force‑include a symbol
|
|
||||||
- name: "bc_linter_include" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: false # for classes, include methods/inner classes
|
|
||||||
exclude: # decorators that force‑exclude a symbol
|
|
||||||
- name: "bc_linter_skip" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: true # for classes, exclude methods/inner classes
|
|
||||||
|
|
||||||
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
|
|
126
.github/CODEOWNERS
vendored
@ -2,127 +2,53 @@
|
|||||||
# for more info about CODEOWNERS file
|
# for more info about CODEOWNERS file
|
||||||
|
|
||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
|
||||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin
|
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||||
|
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
||||||
|
/vllm/multimodal @DarkLight1337 @ywang96
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
/vllm/reasoning @aarnphm
|
||||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
/vllm/entrypoints @aarnphm
|
||||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
|
||||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||||
|
|
||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
||||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
|
||||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
|
||||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
|
||||||
/vllm/v1/spec_decode @benchislett @luccafong
|
|
||||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
|
||||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
|
||||||
/vllm/v1/offloading @ApostaC
|
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||||
|
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||||
|
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||||
/tests/distributed/test_same_node.py @youkaichao
|
/tests/distributed/test_same_node.py @youkaichao
|
||||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
|
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||||
/tests/evals @mgoin
|
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||||
|
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
||||||
/tests/models @DarkLight1337 @ywang96
|
/tests/models @DarkLight1337 @ywang96
|
||||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/tests/multi_step @alexm-redhat @comaniac
|
||||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
/tests/multimodal @DarkLight1337 @ywang96
|
||||||
|
/tests/prefix_caching @comaniac @KuntaiDu
|
||||||
|
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
/tests/weight_loading @mgoin @youkaichao
|
||||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
|
||||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
|
||||||
/tests/v1/kv_connector @ApostaC
|
|
||||||
/tests/v1/offloading @ApostaC
|
|
||||||
|
|
||||||
# Transformers backend
|
|
||||||
/vllm/model_executor/models/transformers @hmellor
|
|
||||||
/tests/models/test_transformers.py @hmellor
|
|
||||||
|
|
||||||
# Docs
|
# Docs
|
||||||
/docs/mkdocs @hmellor
|
/docs @hmellor
|
||||||
/docs/**/*.yml @hmellor
|
|
||||||
/requirements/docs.txt @hmellor
|
|
||||||
.readthedocs.yaml @hmellor
|
|
||||||
mkdocs.yaml @hmellor
|
mkdocs.yaml @hmellor
|
||||||
|
|
||||||
# Linting
|
|
||||||
.markdownlint.yaml @hmellor
|
|
||||||
.pre-commit-config.yaml @hmellor
|
|
||||||
/tools/pre_commit @hmellor
|
|
||||||
|
|
||||||
# CPU
|
|
||||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
|
||||||
/csrc/cpu @bigPYJ1151
|
|
||||||
/vllm/platforms/cpu.py @bigPYJ1151
|
|
||||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
|
||||||
/docker/Dockerfile.cpu @bigPYJ1151
|
|
||||||
|
|
||||||
# Intel GPU
|
|
||||||
/vllm/v1/worker/xpu* @jikunshang
|
|
||||||
/vllm/platforms/xpu.py @jikunshang
|
|
||||||
/docker/Dockerfile.xpu @jikunshang
|
|
||||||
|
|
||||||
# Qwen-specific files
|
|
||||||
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
|
|
||||||
/vllm/model_executor/models/qwen* @sighingnow
|
|
||||||
|
|
||||||
# MTP-specific files
|
|
||||||
/vllm/model_executor/models/deepseek_mtp.py @luccafong
|
|
||||||
|
|
||||||
# Mistral-specific files
|
|
||||||
/vllm/model_executor/models/mistral*.py @patrickvonplaten
|
|
||||||
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
|
||||||
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
|
||||||
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
|
||||||
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
|
||||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
|
||||||
|
|
||||||
# Kernels
|
|
||||||
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
|
||||||
/vllm/attention/ops/triton_unified_attention.py @tdoublep
|
|
||||||
|
|
||||||
# ROCm related: specify owner with write access to notify AMD folks for careful code review
|
|
||||||
/docker/Dockerfile.rocm* @gshtras
|
|
||||||
/vllm/v1/attention/backends/rocm*.py @gshtras
|
|
||||||
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
|
|
||||||
/vllm/attention/ops/rocm*.py @gshtras
|
|
||||||
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
|
|
||||||
|
|
||||||
# TPU
|
|
||||||
/vllm/v1/worker/tpu* @NickLucche
|
|
||||||
/vllm/platforms/tpu.py @NickLucche
|
|
||||||
/vllm/v1/sample/tpu @NickLucche
|
|
||||||
/vllm/tests/v1/tpu @NickLucche
|
|
||||||
|
|
||||||
# KVConnector installation files
|
|
||||||
/requirements/kv_connectors.txt @NickLucche
|
|
||||||
|
|
||||||
# Pooling models
|
|
||||||
/examples/*/pooling/ @noooop
|
|
||||||
/tests/models/*/pooling* @noooop
|
|
||||||
/tests/entrypoints/pooling @noooop
|
|
||||||
/vllm/config/pooler.py @noooop
|
|
||||||
/vllm/pooling_params.py @noooop
|
|
||||||
/vllm/model_executor/layers/pooler.py @noooop
|
|
||||||
|

.github/ISSUE_TEMPLATE/750-RFC.yml (4 changed lines)
@@ -43,6 +43,10 @@ body:
        Any other things you would like to mention.
    validations:
      required: false
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
  - type: checkboxes
    id: askllm
    attributes:

.github/PULL_REQUEST_TEMPLATE.md (21 changed lines)
@@ -1,5 +1,10 @@
-<!-- markdownlint-disable -->
+## Essential Elements of an Effective PR Description Checklist
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.

+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.

 ## Purpose

@@ -7,15 +12,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT

 ## Test Result

----
+## (Optional) Documentation Update
-<details>
-<summary> Essential Elements of an Effective PR Description Checklist </summary>
-
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
-</details>

+<!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

.github/mergify.yml (92 changed lines)
@@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
-    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -11,13 +10,10 @@ pull_request_rules:
     label:
       add:
        - documentation
-    comment:
-      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
-    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
@@ -34,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py
@@ -51,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
-    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:
@@ -61,7 +55,6 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py
@@ -77,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
@@ -91,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
-    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
@@ -103,7 +94,6 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
@@ -117,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py
@@ -129,32 +118,9 @@ pull_request_rules:
       add:
        - qwen

-- name: label-gpt-oss
-  description: Automatically apply gpt-oss label
-  conditions:
-    - label != stale
-    - or:
-      - files~=^examples/.*gpt[-_]?oss.*\.py
-      - files~=^tests/.*gpt[-_]?oss.*\.py
-      - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
-      - files~=^tests/entrypoints/test_context.py
-      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
-      - files~=^vllm/entrypoints/tool_server.py
-      - files~=^vllm/entrypoints/tool.py
-      - files~=^vllm/entrypoints/context.py
-      - title~=(?i)gpt[-_]?oss
-      - title~=(?i)harmony
-  actions:
-    label:
-      add:
-        - gpt-oss
-
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
-    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
@@ -175,7 +141,6 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
@@ -184,8 +149,11 @@ pull_request_rules:
       - files=examples/offline_inference/structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+      - files~=^vllm/model_executor/guided_decoding/
+      - files=tests/model_executor/test_guided_processors.py
+      - files=tests/entrypoints/llm/test_guided_generate.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
@@ -195,7 +163,6 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
@@ -211,7 +178,6 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
@@ -224,7 +190,6 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
-    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu
@@ -240,7 +205,6 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
-    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu
@@ -255,9 +219,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
-    - label != stale
     - or:
       - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -274,9 +238,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
-    - label != stale
     - conflict
     - -closed
   actions:
     label:
       add:
@@ -290,55 +253,20 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
   conditions:
-    - label != stale
-    - or:
     - files~=^vllm/model_executor/model_loader/tensorizer.py
     - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
     - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-    - files~=^tests/model_executor/model_loader/tensorizer_loader/
+    - files~=^tests/tensorizer_loader/
   actions:
     assign:
       users:
        - "sangstar"

-- name: assign reviewer for modelopt changes
-  conditions:
-    - label != stale
-    - or:
-      - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
-      - files~=^vllm/model_executor/layers/quantization/__init__\.py$
-      - files~=^tests/models/quantization/test_modelopt\.py$
-      - files~=^tests/quantization/test_modelopt\.py$
-      - files~=^tests/models/quantization/test_nvfp4\.py$
-      - files~=^docs/features/quantization/modelopt\.md$
-  actions:
-    assign:
-      users:
-        - "Edwardf0t1"
-
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:
        - needs-rebase

-- name: label-kv-connector
-  description: Automatically apply kv-connector label
-  conditions:
-    - label != stale
-    - or:
-      - files~=^examples/online_serving/disaggregated[^/]*/.*
-      - files~=^examples/offline_inference/disaggregated[^/]*/.*
-      - files~=^examples/others/lmcache/
-      - files~=^tests/v1/kv_connector/
-      - files~=^vllm/distributed/kv_transfer/
-      - title~=(?i)\bP/?D\b
-      - title~=(?i)NIXL
-      - title~=(?i)LMCache
-  actions:
-    label:
-      add:
-        - kv-connector
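
The `files~=` and `title~=` conditions above are regular expressions that Mergify evaluates against a pull request's changed file paths and title. As a rough illustration only — the sample paths below are invented, and this is not how Mergify evaluates rules internally — the following Python sketch shows which paths the removed label-gpt-oss rule's patterns would match:

    import re

    # A few of the path regexes from the label-gpt-oss rule above.
    patterns = [
        r"^examples/.*gpt[-_]?oss.*\.py",
        r"^vllm/model_executor/models/.*gpt[-_]?oss.*\.py",
        r"^vllm/entrypoints/harmony_utils.py",
    ]

    changed_files = [
        "examples/offline_inference/gpt_oss_demo.py",   # hypothetical path
        "vllm/model_executor/models/gpt_oss.py",        # hypothetical path
        "docs/usage/troubleshooting.md",
    ]

    for path in changed_files:
        matched = any(re.search(p, path) for p in patterns)
        print(f"{path}: {'gpt-oss label' if matched else 'no label'}")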

.github/scale-config.yml (deleted, 21 lines)
@@ -1,21 +0,0 @@
-# scale-config.yml:
-#   Powers what instance types are available for GHA auto-scaled
-#   runners. Runners listed here will be available as self hosted
-#   runners, configuration is directly pulled from the main branch.
-# runner_types:
-#   runner_label:
-#     instance_type: m4.large
-#     os: linux
-#     # min_available defaults to the global cfg in the ALI Terraform
-#     min_available: undefined
-#     # when max_available value is not defined, no max runners is enforced
-#     max_available: undefined
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: true
-    os: linux

.github/scripts/cleanup_pr_body.sh (8 changed lines)
@@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
 gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 cp "${OLD}" "${NEW}"

-# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/<!--.*-->$/d' "${NEW}"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"

-# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
+# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"

 # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
 sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
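
For reference, here is a minimal Python sketch of what the two `sed` deletions and the final truncation in the newer version of this script do to a PR body; the sample body text is invented for illustration:

    import re

    body = """<!-- markdownlint-disable -->
    PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.

    ## Purpose
    Fix a logging bug.

    **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
    internal notes
    """

    cleaned = []
    for line in body.splitlines():
        if re.search(r"<!--.*-->$", line):          # drop markdown comments
            continue
        if "PLEASE FILL IN THE PR DESCRIPTION HERE" in line:
            continue
        if re.search(r"\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*", line):
            break                                   # drop this line and everything after it
        cleaned.append(line)

    print("\n".join(cleaned))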

.github/workflows/add_label_automerge.yml (2 changed lines)
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Add label
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
         with:
           script: |
             github.rest.issues.addLabels({

.github/workflows/bc-lint.yml (deleted, 29 lines)
@@ -1,29 +0,0 @@
-name: BC Lint
-
-on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      - labeled
-      - unlabeled
-
-jobs:
-  bc_lint:
-    if: github.repository_owner == 'vllm-project'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Run BC Lint Action
-        uses: pytorch/test-infra/.github/actions/bc-lint@main
-        with:
-          repo: ${{ github.event.pull_request.head.repo.full_name }}
-          base_sha: ${{ github.event.pull_request.base.sha }}
-          head_sha: ${{ github.event.pull_request.head.sha }}
-          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
-          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
-          config_dir: .github
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true

.github/workflows/cleanup_pr_body.yml (2 changed lines)
@@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'

.github/workflows/issue_autolabel.yml (deleted, 361 lines)
@@ -1,361 +0,0 @@
-name: Label issues based on keywords
-on:
-  issues:
-    types: [opened, edited, reopened]
-permissions:
-  issues: write  # needed so the workflow can add labels
-  contents: read
-concurrency:
-  group: issue-labeler-${{ github.event.issue.number }}
-  cancel-in-progress: true
-jobs:
-  add-labels:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Label issues based on keywords
-        id: label-step
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
-        with:
-          script: |
-            // Configuration: Add new labels and keywords here
-            const labelConfig = {
-              rocm: {
-                // Keyword search - matches whole words only (with word boundaries)
-                keywords: [
-                  {
-                    term: "composable kernel",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "rccl",
-                    searchIn: "body" // only search in body
-                  },
-                  {
-                    term: "migraphx",
-                    searchIn: "title" // only search in title
-                  },
-                  {
-                    term: "hipgraph",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "ROCm System Management Interface",
-                    searchIn: "body"
-                  },
-                ],
-                // Substring search - matches anywhere in text (partial matches)
-                substrings: [
-                  {
-                    term: "VLLM_ROCM_",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "aiter",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "rocm",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "amd",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "hip-",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "gfx",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "cdna",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "rdna",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "torch_hip",
-                    searchIn: "body" // only in body
-                  },
-                  {
-                    term: "_hip",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "hip_",
-                    searchIn: "both"
-                  },
-                  // ROCm tools and libraries
-                  {
-                    term: "hipify",
-                    searchIn: "both"
-                  },
-                ],
-                // Regex patterns - for complex pattern matching
-                regexPatterns: [
-                  {
-                    pattern: "\\bmi\\d{3}[a-z]*\\b",
-                    description: "AMD GPU names (mi + 3 digits + optional letters)",
-                    flags: "gi",
-                    searchIn: "both" // "title", "body", or "both"
-                  }
-                ],
-              },
-              // Add more label configurations here as needed
-              // example: {
-              //   keywords: [...],
-              //   substrings: [...],
-              //   regexPatterns: [...]
-              // },
-            };
-            // Helper function to create regex based on search type
-            function createSearchRegex(term, type) {
-              // Escape special regex characters in the term
-              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-              switch (type) {
-                case 'keyword':
-                  // Word boundary search - matches whole words only
-                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
-                case 'substring':
-                  // Substring search - matches anywhere in the text
-                  return new RegExp(escapedTerm, "gi");
-                default:
-                  throw new Error(`Unknown search type: ${type}`);
-              }
-            }
-            // Helper function to find matching terms in text with line information
-            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
-              const matches = [];
-              const lines = text.split('\n');
-              for (const termConfig of searchTerms) {
-                let regex;
-                let term, searchIn, pattern, description, flags;
-                // Handle different input formats (string or object)
-                if (typeof termConfig === 'string') {
-                  term = termConfig;
-                  searchIn = 'both'; // default
-                } else {
-                  term = termConfig.term;
-                  searchIn = termConfig.searchIn || 'both';
-                  pattern = termConfig.pattern;
-                  description = termConfig.description;
-                  flags = termConfig.flags;
-                }
-                // Skip if this term shouldn't be searched in the current location
-                if (searchIn !== 'both' && searchIn !== searchLocation) {
-                  continue;
-                }
-                // Create appropriate regex
-                if (searchType === 'regex') {
-                  regex = new RegExp(pattern, flags || "gi");
-                } else {
-                  regex = createSearchRegex(term, searchType);
-                }
-                const termMatches = [];
-                // Check each line for matches
-                lines.forEach((line, lineIndex) => {
-                  const lineMatches = line.match(regex);
-                  if (lineMatches) {
-                    lineMatches.forEach(match => {
-                      termMatches.push({
-                        match: match,
-                        lineNumber: lineIndex + 1,
-                        lineContent: line.trim(),
-                        searchType: searchType,
-                        searchLocation: searchLocation,
-                        originalTerm: term || pattern,
-                        description: description,
-                        // Show context around the match in the line
-                        context: line.length > 100 ?
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
-                          line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
-                          : line.trim()
-                      });
-                    });
-                  }
-                });
-                if (termMatches.length > 0) {
-                  matches.push({
-                    term: term || (description || pattern),
-                    searchType: searchType,
-                    searchLocation: searchLocation,
-                    searchIn: searchIn,
-                    pattern: pattern,
-                    matches: termMatches,
-                    count: termMatches.length
-                  });
-                }
-              }
-              return matches;
-            }
-            // Helper function to check if label should be added
-            async function processLabel(labelName, config) {
-              const body = context.payload.issue.body || "";
-              const title = context.payload.issue.title || "";
-              core.notice(`Processing label: ${labelName}`);
-              core.notice(`Issue Title: "${title}"`);
-              core.notice(`Issue Body length: ${body.length} characters`);
-              let shouldAddLabel = false;
-              let allMatches = [];
-              let reason = '';
-              const keywords = config.keywords || [];
-              const substrings = config.substrings || [];
-              const regexPatterns = config.regexPatterns || [];
-              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-              // Search in title
-              if (title.trim()) {
-                core.notice(`Searching in title: "${title}"`);
-                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
-                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
-                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
-              }
-              // Search in body
-              if (body.trim()) {
-                core.notice(`Searching in body (${body.length} characters)`);
-                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
-                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
-                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
-              }
-              if (allMatches.length > 0) {
-                core.notice(`Found ${allMatches.length} matching term(s):`);
-                for (const termMatch of allMatches) {
-                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
-                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-                  if (termMatch.searchType === 'regex') {
-                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
-                  } else {
-                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
-                  }
-                  // Show details for each match
-                  termMatch.matches.forEach((match, index) => {
-                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
-                    if (match.description) {
-                      core.notice(`       Description: ${match.description}`);
-                    }
-                    core.notice(`       Context: ${match.context}`);
-                    if (match.lineContent !== match.context) {
-                      core.notice(`       Full line: ${match.lineContent}`);
-                    }
-                  });
-                }
-                shouldAddLabel = true;
-                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
-                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
-                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
-                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
-                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
-                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
-              }
-              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
-              core.notice(`Reason: ${reason || 'No matching terms found'}`);
-              if (shouldAddLabel) {
-                const existingLabels = context.payload.issue.labels.map(l => l.name);
-                if (!existingLabels.includes(labelName)) {
-                  await github.rest.issues.addLabels({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: context.issue.number,
-                    labels: [labelName],
-                  });
-                  core.notice(`Label "${labelName}" added. ${reason}`);
-                  return true;
-                }
-                core.notice(`Label "${labelName}" already present.`);
-                return false;
-              }
-              core.notice(`No matching terms found for label "${labelName}".`);
-              return false;
-            }
-            // Process all configured labels
-            const labelsAddedResults = await Promise.all(
-              Object.entries(labelConfig).map(([labelName, config]) =>
-                processLabel(labelName, config).then(added => ({ labelName, added }))
-              )
-            );
-
-            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
-
-            // Return which labels were added for the next step
-            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
-            core.setOutput('labels_added', JSON.stringify(addedLabels));
-            return addedLabels;
-
-      - name: CC users for labeled issues
-        if: steps.label-step.outputs.labels_added != '[]'
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
-        with:
-          script: |
-            // Configuration: Map labels to GitHub users to CC
-            // You can add multiple users per label, and multiple label configurations
-            const ccConfig = {
-              rocm: {
-                users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
-                message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
-              },
-              // Add more label -> user mappings here
-              // Example:
-              // cuda: {
-              //   users: ['user1', 'user2'],
-              //   message: 'CC {users} for CUDA-related issue'
-              // },
-              // performance: {
-              //   users: ['perfexpert'],
-              //   message: 'CC {users} for performance issue'
-              // },
-            };
-
-            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
-            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
-
-            // Get existing comments to check for already mentioned users
-            const comments = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-            });
-
-            const issueBody = context.payload.issue.body || '';
-            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
-
-            // Process each label that was added
-            for (const label of labelsAdded) {
-              if (ccConfig[label]) {
-                const config = ccConfig[label];
-                const usersToMention = [];
-
-                // Check which users haven't been mentioned yet
-                for (const user of config.users) {
-                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
-                  if (!mentionPattern.test(allExistingText)) {
-                    usersToMention.push(user);
-                  } else {
-                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
-                  }
-                }
-
-                // Post comment if there are users to mention
-                if (usersToMention.length > 0) {
-                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
-                  const message = config.message.replace('{users}', mentions);
-
-                  await github.rest.issues.createComment({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: context.issue.number,
-                    body: message
-                  });
-
-                  core.notice(`CC comment added for label "${label}": ${mentions}`);
-                } else {
-                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
-                }
-              }
-            }
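
The removed workflow's createSearchRegex helper distinguishes whole-word "keyword" matches from "substring" matches. A minimal Python sketch of the same idea follows — Python is used here instead of the workflow's JavaScript, and the sample issue title is made up:

    import re

    def create_search_regex(term: str, search_type: str) -> re.Pattern:
        # Mirrors the removed workflow's createSearchRegex helper, in Python.
        escaped = re.escape(term)
        if search_type == "keyword":      # whole-word match only
            return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
        if search_type == "substring":    # match anywhere in the text
            return re.compile(escaped, re.IGNORECASE)
        raise ValueError(f"Unknown search type: {search_type}")

    title = "Crash with hipgraph capture on gfx942"  # invented issue title
    print(bool(create_search_regex("hipgraph", "keyword").search(title)))  # True: whole word
    print(bool(create_search_regex("graph", "keyword").search(title)))     # False: only part of a word
    print(bool(create_search_regex("gfx", "substring").search(title)))     # True: substring hit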

.github/workflows/lint-and-deploy.yaml (new file, 85 lines)
@@ -0,0 +1,85 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+permissions:
+  contents: read
+
+jobs:
+  lint-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+        with:
+          version: v3.14.4
+
+      #Python is required because ct lint runs Yamale and yamllint which require Python.
+      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        with:
+          python-version: '3.13'
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
+        with:
+          version: v3.10.1
+
+      - name: Run chart-testing (lint)
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
+
+      - name: Setup minio
+        run: |
+          docker network create vllm-net
+          docker run -d -p 9000:9000 --name minio --net vllm-net \
+            -e "MINIO_ACCESS_KEY=minioadmin" \
+            -e "MINIO_SECRET_KEY=minioadmin" \
+            -v /tmp/data:/data \
+            -v /tmp/config:/root/.minio \
+            minio/minio server /data
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          export AWS_EC2_METADATA_DISABLED=true
+          mkdir opt-125m
+          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+      - name: Create kind cluster
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+
+      - name: Build the Docker image vllm cpu
+        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+
+      - name: Configuration of docker images, network and namespace for the kind cluster
+        run: |
+          docker pull amazon/aws-cli:2.6.4
+          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
+          kind load docker-image vllm-cpu-env:latest --name chart-testing
+          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+          kubectl create ns ns-vllm
+
+      - name: Run chart-testing (install)
+        run: |
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+
+      - name: curl test
+        run: |
+          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+          sleep 10
+          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+            --header "Content-Type: application/json" \
+            --data '{
+              "model": "opt-125m",
+              "prompt": "San Francisco is a",
+              "max_tokens": 7,
+              "temperature": 0
+            }'):$CODE"
+          echo "$CODE"

.github/workflows/matchers/markdownlint.json (deleted, 17 lines)
@@ -1,17 +0,0 @@
-{
-  "problemMatcher": [
-    {
-      "owner": "markdownlint",
-      "pattern": [
-        {
-          "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
-          "file": 1,
-          "line": 2,
-          "column": 3,
-          "code": 4,
-          "message": 5
-        }
-      ]
-    }
-  ]
-}
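
The deleted problem matcher maps markdownlint output onto file, line, column, code, and message capture groups. A small Python sketch of how that regexp carves up a line — the character class is written as [\w/-] to suit Python's regex syntax, and the sample output line is invented for illustration:

    import re

    pattern = re.compile(r"^([^:]*):(\d+):?(\d+)?\s([\w/-]*)\s(.*)$")

    sample = "README.md:12:1 MD013/line-length Line length [Expected: 80; Actual: 120]"
    m = pattern.match(sample)
    if m:
        file, line, column, code, message = m.groups()
        print(file, line, column, code, message)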

.github/workflows/pre-commit.yml (7 changed lines)
@@ -5,10 +5,6 @@ on:
   push:
     branches: [main]

-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
 permissions:
   contents: read

@@ -17,11 +13,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-    - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
      with:

.github/workflows/publish.yml (new file, 111 lines)
@@ -0,0 +1,111 @@
+# This workflow will upload a Python Package to Release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+  push:
+    tags:
+      - v*
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+jobs:
+  release:
+    # Retrieve tag and create release
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Extract branch info
+        shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
+
+      - name: Create Release
+        id: create_release
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        env:
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/create_release.js')
+            await script(github, context, core)
+
+# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
+#  wheel:
+#    name: Build Wheel
+#    runs-on: ${{ matrix.os }}
+#    needs: release
+
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        os: ['ubuntu-20.04']
+#        python-version: ['3.9', '3.10', '3.11', '3.12']
+#        pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
+#        cuda-version: ['11.8', '12.1']
+
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+#      - name: Setup ccache
+#        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+#        with:
+#          create-symlink: true
+#          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+#      - name: Set up Linux Env
+#        if: ${{ runner.os == 'Linux' }}
+#        run: |
+#          bash -x .github/workflows/scripts/env.sh
+
+#      - name: Set up Python
+#        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+#        with:
+#          python-version: ${{ matrix.python-version }}
+
+#      - name: Install CUDA ${{ matrix.cuda-version }}
+#        run: |
+#          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+#      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+#        run: |
+#          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+#      - name: Build wheel
+#        shell: bash
+#        env:
+#          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+#        run: |
+#          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+#          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+#          asset_name=${wheel_name//"linux"/"manylinux1"}
+#          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+#          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+#      - name: Upload Release Asset
+#        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+#        env:
+#          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+#        with:
+#          upload_url: ${{ needs.release.outputs.upload_url }}
+#          asset_path: ./dist/${{ env.wheel_name }}
+#          asset_name: ${{ env.asset_name }}
+#          asset_content_type: application/*
+
+# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
+#      - name: Publish package
+#        uses: pypa/gh-action-pypi-publish@release/v1.8
+#        with:
+#          repository-url: https://test.pypi.org/legacy/
+#          password: ${{ secrets.PYPI_API_TOKEN }}
+#          skip-existing: true
51
.github/workflows/reminder_comment.yml
vendored
51
.github/workflows/reminder_comment.yml
vendored
@@ -9,46 +9,19 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Remind to run full CI on PR
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
         with:
           script: |
-            try {
-              // Get the PR author
-              const prAuthor = context.payload.pull_request.user.login;
-
-              // Check if this is the author's first PR in this repository
-              // Use GitHub's search API to find all PRs by this author
-              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
-                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
-                per_page: 100
-              });
-
-              const authorPRCount = searchResults.total_count;
-              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
-
-              // Only post comment if this is the first PR (only one PR by this author)
-              if (authorPRCount === 1) {
-                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
-                await github.rest.issues.createComment({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  issue_number: context.issue.number,
-                  body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-                    '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-                    'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
-                    'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
-                    'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                    'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-                    'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
-                    '🚀'
-                });
-              } else {
-                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
-              }
-            } catch (error) {
-              console.error('Error checking PR history or posting comment:', error);
-              // Don't fail the workflow, just log the error
-            }
+            github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+                '🚀'
+            })
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
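On the `main` side of this diff the workflow no longer comments unconditionally: it counts the author's PRs through GitHub's search API and only greets first-time contributors. As a rough illustration of that check outside of Actions, here is a minimal Python sketch; it assumes a `GITHUB_TOKEN` environment variable and the public REST search endpoint, and the repository/author values are placeholders.

```python
import os

import requests


def is_first_pr(owner: str, repo: str, author: str) -> bool:
    """Return True when the author has exactly one PR in the repo (i.e. this one)."""
    resp = requests.get(
        "https://api.github.com/search/issues",
        params={"q": f"repo:{owner}/{repo} type:pr author:{author}", "per_page": 1},
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
        timeout=30,
    )
    resp.raise_for_status()
    # total_count already includes the PR that triggered the workflow.
    return resp.json()["total_count"] == 1


if __name__ == "__main__":
    # Placeholder author name for illustration only.
    print(is_first_pr("vllm-project", "vllm", "some-contributor"))
```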
.github/workflows/scripts/build.sh (1 changed line)
@@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"

 bash tools/check_repo.sh
.github/workflows/stale.yml (2 changed lines)
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
.gitignore (17 changed lines)
@@ -4,9 +4,6 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*

-# triton jit
-.triton
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -150,8 +147,7 @@ venv.bak/
 # mkdocs documentation
 /site
 docs/argparse
-docs/examples/*
-!docs/examples/README.md
+docs/examples

 # mypy
 .mypy_cache/
@@ -177,14 +173,6 @@ cython_debug/
 # VSCode
 .vscode/

-# Claude
-CLAUDE.md
-.claude/
-
-# Codex
-AGENTS.md
-.codex/
-
 # DS Store
 .DS_Store
@@ -215,6 +203,3 @@ shellcheck*/

 # Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
-
-# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
@@ -1,12 +0,0 @@
-MD007:
-  indent: 4
-MD013: false
-MD024:
-  siblings_only: true
-MD033: false
-MD045: false
-MD046: false
-MD051: false
-MD052: false
-MD053: false
-MD059: false
@@ -6,39 +6,50 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.14.0
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
   hooks:
-  - id: ruff-check
+  - id: yapf
+    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.11.7
+  hooks:
+  - id: ruff
     args: [--output-format, github, --fix]
   - id: ruff-format
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.34.0
   hooks:
   - id: typos
-    args: [--force-exclude]
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v21.1.2
+  rev: v20.1.3
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
     types_or: [c++, cuda]
     args: [--style=file, --verbose]
-- repo: https://github.com/igorshubovych/markdownlint-cli
-  rev: v0.45.0
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.29
   hooks:
-  - id: markdownlint
+  - id: pymarkdown
     exclude: '.*\.inc\.md'
-    stages: [manual] # Only run in CI
+    args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.9.1
+  rev: 0.6.17
   hooks:
   - id: pip-compile
-    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
     files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
@@ -49,32 +60,38 @@ repos:
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
     stages: [pre-commit] # Don't run in CI
-    <<: &mypy_common
-      language: python
-      types_or: [python, pyi]
-      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
-    entry: python tools/pre_commit/mypy.py 1 "3.10"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
-    entry: python tools/pre_commit/mypy.py 1 "3.11"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
-    entry: python tools/pre_commit/mypy.py 1 "3.12"
-    <<: *mypy_common
-    stages: [manual] # Only run in CI
-  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.13
-    entry: python tools/pre_commit/mypy.py 1 "3.13"
-    <<: *mypy_common
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
@@ -138,15 +155,18 @@ repos:
     additional_dependencies: [regex]
   - id: check-pickle-imports
     name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+    entry: python tools/check_pickle_imports.py
     language: python
     types: [python]
-    additional_dependencies: [regex]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    additional_dependencies: [regex]
+    types: [python]
+    pass_filenames: true
+    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
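Both versions of this config are driven by the standard `pre-commit` CLI. As a usage sketch (not part of the diff), the default-stage hooks and the CI-only manual-stage hooks such as `mypy-3.10` could be exercised locally like this:

```python
import subprocess

# Hooks in the default pre-commit stage run against the whole tree.
subprocess.run(["pre-commit", "run", "--all-files"], check=False)

# Hooks marked `stages: [manual]` (the per-version mypy hooks above) only run
# when the manual stage is requested explicitly, which is what CI does.
subprocess.run(
    ["pre-commit", "run", "--hook-stage", "manual", "mypy-3.10", "--all-files"],
    check=False,
)
```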
@@ -7,13 +7,9 @@ build:
   os: ubuntu-22.04
   tools:
     python: "3.12"
-  jobs:
-    post_checkout:
-      - git fetch --unshallow || true
-
 mkdocs:
   configuration: mkdocs.yaml
-  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
@@ -1,2 +1 @@
 collect_env.py
-vllm/model_executor/layers/fla/ops/*.py
CMakeLists.txt (229 changed lines)
@@ -13,10 +13,6 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -34,10 +30,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -49,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -86,9 +82,6 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
-elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
     CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@@ -178,25 +171,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-#
-# Set compression mode for CUDA >=13.x.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA" AND
-   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
-endif()
-
-#
-# Set CUDA include flags for CXX compiler.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
-  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
-  endif()
-endif()
-
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -269,12 +243,13 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/w8a8/int8/scaled_quant.cu"
-  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
+  "csrc/prepare_inputs/advance_step.cu"
   "csrc/custom_all_reduce.cu"
   "csrc/torch_bindings.cpp")
@@ -282,7 +257,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -312,15 +287,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_MakeAvailable(cutlass)

   list(APPEND VLLM_EXT_SRC
+    "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
-    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+    "csrc/attention/mla/cutlass_mla_entry.cu"
+    "csrc/quantization/fp8/per_token_group_quant.cu")

   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -374,27 +351,20 @@
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()

     list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

     set(MARLIN_SRCS
+      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_SRCS}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()
     list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")

     message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
   else()
     message(STATUS "Not building Marlin kernels as no compatible archs found"
@@ -424,11 +394,11 @@
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -452,16 +422,11 @@

   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -486,16 +451,12 @@

   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -526,7 +487,7 @@
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
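Many of the CMake hunks in this compare differ only in the architecture strings passed to `cuda_archs_loose_intersection`. The helper itself lives in `cmake/utils.cmake` and is not part of this diff; as a rough mental model only (not the actual implementation), it keeps the entries of the build's `CUDA_ARCHS` whose numeric version matches one of the requested architectures, treating feature suffixes such as `a` or `f` loosely. A purely illustrative Python sketch of that idea:

```python
def loose_intersection(requested: str, cuda_archs: str) -> list[str]:
    """Illustrative only: intersect semicolon-separated arch lists by numeric
    version, ignoring feature suffixes such as 'a' or 'f' ('9.0a' matches '9.0')."""

    def base(arch: str) -> str:
        return arch.rstrip("af")

    wanted = {base(a) for a in requested.split(";") if a}
    return [a for a in cuda_archs.split(";") if a and base(a) in wanted]


# Examples loosely mirroring the hunks above (outputs are for this sketch only):
print(loose_intersection("9.0a;", "8.0;8.9;9.0"))      # ['9.0']
print(loose_intersection("10.0a;10.1a", "10.0;12.0"))  # ['10.0']
```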
@@ -568,40 +529,11 @@
   endif()
   endif()

-  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
-  # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
-    set(SRCS
-      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${FP4_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
-    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
-  else()
-    message(STATUS "Not building NVFP4 as no compatible archs were found.")
-    # clear FP4_ARCHS
-    set(FP4_ARCHS)
-  endif()
-
   # FP4 Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
       "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@@ -609,7 +541,7 @@
       SRCS "${SRCS}"
       CUDA_ARCHS "${FP4_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
     message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
   else()
@@ -619,13 +551,10 @@
   endif()

   # CUTLASS MLA Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
+      "csrc/attention/mla/cutlass_mla_kernels.cu"
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -649,7 +578,7 @@
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -667,13 +596,9 @@
     endif()
   endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -692,13 +617,9 @@
   endif()

   # moe_data.cu is used by all CUTLASS MoE kernels.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -714,14 +635,10 @@
       "in CUDA target architectures.")
     endif()
   endif()

-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -808,44 +725,6 @@
       "found in CUDA target architectures")
     endif()
   endif()

-  # Only build W4A8 kernels if we are building for something compatible with sm90a
-  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
-    set(SRCS
-      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
-
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${W4A8_ARCHS}")
-
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-
-    message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
-        AND W4A8_ARCHS)
-      message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
-        "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-        "later if you intend on running w4a16 quantized models on "
-        "Hopper.")
-    else()
-      message(STATUS "Not building W4A8 kernels as no compatible archs "
-        "found in CUDA target architectures")
-    endif()
-  endif()
-
-  # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
-  if(HADACORE_ARCHS)
-    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${HADACORE_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    message(STATUS "Building hadacore")
-  endif()
-
   # if CUDA endif
 endif()
@@ -886,17 +765,7 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
-    "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu")
-endif()
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(MOE_PERMUTE_SRC
-    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-    "csrc/moe/moe_permute_unpermute_op.cu")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()

 set_gencode_flags_for_srcs(
@@ -957,10 +826,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MOE_WNAA16_MARLIN_SRC}"
       CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()

     list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

@@ -971,6 +836,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 endif()

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(MOE_PERMUTE_SRC
+    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+    "csrc/moe/moe_permute_unpermute_op.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_PERMUTE_SRC}"
+    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+
+  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
@@ -1007,7 +883,6 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
-  include(cmake/external_projects/qutlass.cmake)

   # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
@@ -2,6 +2,7 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
+include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt
README.md (26 changed lines)
@@ -1,4 +1,3 @@
-<!-- markdownlint-disable MD001 MD041 -->
 <p align="center">
 <picture>
 <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
@@ -14,27 +13,17 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

----
-Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
-
 ---

 *Latest News* 🔥
-- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
-- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
-- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
-- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
+- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

 <details>
 <summary>Previous News</summary>

-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
@@ -57,7 +46,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 </details>

 ---

 ## About

 vLLM is a fast and easy-to-use library for LLM inference and serving.
@@ -82,12 +70,11 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
 - Multi-LoRA support

 vLLM seamlessly supports most popular open-source models on HuggingFace, including:

 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
 - Embedding Models (e.g., E5-Mistral)
@@ -104,7 +91,6 @@ pip install vllm
 ```

 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.

 - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
@@ -121,7 +107,6 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with docs/community/sponsors.md -->
 Cash Donations:

 - a16z
 - Dropbox
 - Sequoia Capital
@@ -129,8 +114,6 @@ Cash Donations:
 - ZhenFund

 Compute Resources:

-- Alibaba Cloud
 - AMD
 - Anyscale
 - AWS
@@ -149,7 +132,6 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
-- Volcengine

 Slack Sponsor: Anyscale

@@ -171,7 +153,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us

 <!-- --8<-- [start:contact-us] -->
-- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
@@ -60,10 +60,9 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons
 Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.

 **Current Coverage:**

 * Models: Llama3, Llama4, and Mixtral
 * Hardware: NVIDIA H100 and AMD MI300x
-* _Note: Coverage may change based on new model releases and hardware availability_
+* *Note: Coverage may change based on new model releases and hardware availability*

 **Performance Validation Process:**

@@ -72,13 +71,11 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github

 **Step 2: Review Benchmark Setup**
 Familiarize yourself with the benchmark configurations:

 * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
 * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)

 **Step 3: Run the Benchmark**
 Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:

 * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
 * **vLLM commit**: Set to the RC commit hash
SECURITY.md (45 changed lines)
@@ -1,50 +1,13 @@
 # Security Policy

-## Reporting security issues
+## Reporting a Vulnerability

-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

-## Issue triage
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).

-Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+---

-## Threat model
-
 Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.

 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

-## Issue severity
-
-We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
-
-### CRITICAL Severity
-
-Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0.
-
-### HIGH Severity
-
-Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
-
-### MODERATE Severity
-
-Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
-
-### LOW Severity
-
-Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
|
|
||||||
## Prenotification policy
|
|
||||||
|
|
||||||
For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
|
|
||||||
|
|
||||||
* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
|
|
||||||
|
|
||||||
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
|
|
||||||
|
|
||||||
* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
|
|
||||||
* Substantial internal deployment leveraging the upstream vLLM project.
|
|
||||||
* Established internal security teams and comprehensive compliance measures.
|
|
||||||
* Active and consistent contributions to the upstream vLLM project.
|
|
||||||
|
|
||||||
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
|
|
||||||
|
@ -1,20 +1,605 @@

# Benchmarks

This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.

## Contents

- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)

## Usage

For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).

For full CLI reference see:

- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>

# Benchmarking vLLM

This README guides you through running benchmark tests with the extensive datasets supported on vLLM. It's a living document, updated as new features and datasets become available.

**Dataset Overview**

| Dataset | Online | Offline | Data Path |
| --- | --- | --- | --- |
| **ShareGPT** | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| **BurstGPT** | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| **Sonnet** | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| **Random** | ✅ | ✅ | `synthetic` |
| **HuggingFace-VisionArena** | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| **HuggingFace-InstructCoder** | ✅ | ✅ | `likaixin/InstructCoder` |
| **HuggingFace-AIMO** | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
| **HuggingFace-Other** | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
| **Custom** | ✅ | ✅ | Local file: `data.jsonl` |

✅: supported

🟡: Partial support

🚧: to be supported

**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`

---

<details>
|
||||||
|
<summary><b>🚀 Example - Online Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
First start serving your model
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run the benchmarking script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# download dataset
|
||||||
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--dataset-name sharegpt \
|
||||||
|
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
If successful, you will see the following output
|
||||||
|
|
||||||
|
```
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 10
|
||||||
|
Benchmark duration (s): 5.78
|
||||||
|
Total input tokens: 1369
|
||||||
|
Total generated tokens: 2212
|
||||||
|
Request throughput (req/s): 1.73
|
||||||
|
Output token throughput (tok/s): 382.89
|
||||||
|
Total Token throughput (tok/s): 619.85
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 71.54
|
||||||
|
Median TTFT (ms): 73.88
|
||||||
|
P99 TTFT (ms): 79.49
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 7.91
|
||||||
|
Median TPOT (ms): 7.96
|
||||||
|
P99 TPOT (ms): 8.03
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 7.74
|
||||||
|
Median ITL (ms): 7.70
|
||||||
|
P99 ITL (ms): 8.39
|
||||||
|
==================================================
|
||||||
|
```
|
||||||
|
|
||||||
|
**Custom Dataset**
|
||||||
|
|
||||||
|
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||||
|
|
||||||
|
```
|
||||||
|
{"prompt": "What is the capital of India?"}
|
||||||
|
{"prompt": "What is the capital of Iran?"}
|
||||||
|
{"prompt": "What is the capital of China?"}
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# start server
|
||||||
|
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# run benchmarking script
|
||||||
|
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||||
|
--backend vllm \
|
||||||
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--dataset-name custom \
|
||||||
|
--dataset-path <path-to-your-data-jsonl> \
|
||||||
|
--custom-skip-chat-template \
|
||||||
|
--num-prompts 80 \
|
||||||
|
--max-concurrency 1 \
|
||||||
|
--temperature=0.3 \
|
||||||
|
--top-p=0.75 \
|
||||||
|
--result-dir "./log/"
|
||||||
|
```
|
||||||
|
|
||||||
|
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||||
|
|
||||||
|
**VisionArena Benchmark for Vision Language Models**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# need a model with vision capability here
|
||||||
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
**InstructCoder Benchmark with Speculative Decoding**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--speculative-config $'{"method": "ngram",
|
||||||
|
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||||
|
"prompt_lookup_min": 2}'
|
||||||
|
```
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path likaixin/InstructCoder \
|
||||||
|
--num-prompts 2048
|
||||||
|
```
|
||||||
|
|
||||||
|
**Other HuggingFaceDataset Examples**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||||
|
--hf-split train \
|
||||||
|
--hf-subset "chart2text(cauldron)" \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`AI-MO/aimo-validation-aime`**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--model Qwen/QwQ-32B \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path AI-MO/aimo-validation-aime \
|
||||||
|
--num-prompts 10 \
|
||||||
|
--seed 42
|
||||||
|
```
|
||||||
|
|
||||||
|
**`philschmid/mt-bench`**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--model Qwen/QwQ-32B \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path philschmid/mt-bench \
|
||||||
|
--num-prompts 80
|
||||||
|
```
|
||||||
|
|
||||||
|
**Running With Sampling Parameters**
|
||||||
|
|
||||||
|
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||||
|
parameters can be specified. Example client command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--dataset-name sharegpt \
|
||||||
|
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--top-k 10 \
|
||||||
|
--top-p 0.9 \
|
||||||
|
--temperature 0.5 \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**Running With Ramp-Up Request Rate**

The benchmark tool also supports ramping up the request rate over the duration of the benchmark run. This can be useful for stress-testing the server or for finding the maximum throughput that it can handle, given some latency budget.

Two ramp-up strategies are supported:

- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.

The following arguments can be used to control the ramp-up (see the example command after this list):

- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
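As a rough sketch (not from the original README), the ramp-up flags can be combined with the ShareGPT serving benchmark shown earlier; the dataset path, prompt count, and rate values here are illustrative:

```bash
# Linearly ramp the request rate from 1 to 10 req/s over the run (values illustrative).
python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 10
```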
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset-name sonnet \
|
||||||
|
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
If successful, you will see the following output
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
||||||
|
Total num prompt tokens: 5014
|
||||||
|
Total num output tokens: 1500
|
||||||
|
```
|
||||||
|
|
||||||
|
**VisionArena Benchmark for Vision Language Models**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--num-prompts 1000 \
|
||||||
|
--hf-split train
|
||||||
|
```
|
||||||
|
|
||||||
|
The `num prompt tokens` now includes image token counts
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
||||||
|
Total num prompt tokens: 14527
|
||||||
|
Total num output tokens: 1280
|
||||||
|
```
|
||||||
|
|
||||||
|
**InstructCoder Benchmark with Speculative Decoding**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
|
VLLM_USE_V1=1 \
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--dataset-name=hf \
|
||||||
|
--dataset-path=likaixin/InstructCoder \
|
||||||
|
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--input-len=1000 \
|
||||||
|
--output-len=100 \
|
||||||
|
--num-prompts=2048 \
|
||||||
|
--async-engine \
|
||||||
|
--speculative-config $'{"method": "ngram",
|
||||||
|
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||||
|
"prompt_lookup_min": 2}'
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||||
|
Total num prompt tokens: 261136
|
||||||
|
Total num output tokens: 204800
|
||||||
|
```
|
||||||
|
|
||||||
|
**Other HuggingFaceDataset Examples**
|
||||||
|
|
||||||
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||||
|
--hf-split train \
|
||||||
|
--hf-subset "chart2text(cauldron)" \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`AI-MO/aimo-validation-aime`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/QwQ-32B \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path AI-MO/aimo-validation-aime \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benchmark with LoRA Adapters**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
# download dataset
|
||||||
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-hf \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--dataset_name sharegpt \
|
||||||
|
--num-prompts 10 \
|
||||||
|
--max-loras 2 \
|
||||||
|
--max-lora-rank 8 \
|
||||||
|
--enable-lora \
|
||||||
|
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
||||||
|
|
||||||
|
**Server Setup**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
**JSON Schema Benchmark**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset json \
|
||||||
|
--structured-output-ratio 1.0 \
|
||||||
|
--request-rate 10 \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Grammar-based Generation Benchmark**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset grammar \
|
||||||
|
--structure-type grammar \
|
||||||
|
--request-rate 10 \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Regex-based Generation Benchmark**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset regex \
|
||||||
|
--request-rate 10 \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Choice-based Generation Benchmark**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset choice \
|
||||||
|
--request-rate 10 \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
**XGrammar Benchmark Dataset**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset xgrammar_bench \
|
||||||
|
--request-rate 10 \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
Benchmark the performance of long document question-answering with prefix caching.
|
||||||
|
|
||||||
|
**Basic Long Document QA Test**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-documents 16 \
|
||||||
|
--document-length 2000 \
|
||||||
|
--output-len 50 \
|
||||||
|
--repeat-count 5
|
||||||
|
```
|
||||||
|
|
||||||
|
**Different Repeat Modes**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Random mode (default) - shuffle prompts randomly
|
||||||
|
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-documents 8 \
|
||||||
|
--document-length 3000 \
|
||||||
|
--repeat-count 3 \
|
||||||
|
--repeat-mode random
|
||||||
|
|
||||||
|
# Tile mode - repeat entire prompt list in sequence
|
||||||
|
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-documents 8 \
|
||||||
|
--document-length 3000 \
|
||||||
|
--repeat-count 3 \
|
||||||
|
--repeat-mode tile
|
||||||
|
|
||||||
|
# Interleave mode - repeat each prompt consecutively
|
||||||
|
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-documents 8 \
|
||||||
|
--document-length 3000 \
|
||||||
|
--repeat-count 3 \
|
||||||
|
--repeat-mode interleave
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
Benchmark the efficiency of automatic prefix caching.
|
||||||
|
|
||||||
|
**Fixed Prompt with Prefix Caching**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_prefix_caching.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-prompts 1 \
|
||||||
|
--repeat-count 100 \
|
||||||
|
--input-length-range 128:256
|
||||||
|
```
|
||||||
|
|
||||||
|
**ShareGPT Dataset with Prefix Caching**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# download dataset
|
||||||
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
python3 benchmarks/benchmark_prefix_caching.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--repeat-count 5 \
|
||||||
|
--input-length-range 128:256
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
Benchmark the performance of request prioritization in vLLM.
|
||||||
|
|
||||||
|
**Basic Prioritization Test**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_prioritization.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--input-len 128 \
|
||||||
|
--output-len 64 \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--scheduling-policy priority
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multiple Sequences per Prompt**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 benchmarks/benchmark_prioritization.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--input-len 128 \
|
||||||
|
--output-len 64 \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--scheduling-policy priority \
|
||||||
|
--n 2
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
@ -3,7 +3,6 @@

This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
@ -31,12 +30,6 @@ cd vllm

You must set the following variables at the top of the script before execution.

Note: You can also override the default values below via environment variables when running the script.

```bash
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
```
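For instance, a minimal sketch of overriding only a few values for a single run, relying on the script's built-in defaults for everything else (the values here are illustrative):

```bash
# Only the overridden variables change; the rest use the defaults defined at the top of auto_tune.sh.
INPUT_LEN=1800 OUTPUT_LEN=20 MAX_LATENCY_ALLOWED_MS=500 bash auto_tune.sh
```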
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
@ -59,7 +51,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE

1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.

    ```bash
    cd <FOLDER_OF_THIS_SCRIPT>
    bash auto_tune.sh
    ```
@ -71,40 +63,34 @@ bash auto_tune.sh

Here are a few examples of how to configure the script for different goals:

### 1. Maximize Throughput (No Latency Constraint)

- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:

    ```bash
    INPUT_LEN=1800
    OUTPUT_LEN=20
    MAX_MODEL_LEN=2048
    MIN_CACHE_HIT_PCT=0
    MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
    ```

### 2. Maximize Throughput with a Latency Requirement

- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:

    ```bash
    INPUT_LEN=1800
    OUTPUT_LEN=20
    MAX_MODEL_LEN=2048
    MIN_CACHE_HIT_PCT=0
    MAX_LATENCY_ALLOWED_MS=500
    ```

### 3. Maximize Throughput with Prefix Caching and Latency Requirements

- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:

    ```bash
    INPUT_LEN=1800
    OUTPUT_LEN=20
    MAX_MODEL_LEN=2048
    MIN_CACHE_HIT_PCT=60
    MAX_LATENCY_ALLOWED_MS=500
    ```
@ -115,11 +101,11 @@ After the script finishes, you will find the results in a new, timestamped direc

- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
    - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
    - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.

- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.

    ```text
    # Example result.txt content
    hash:a1b2c3d4...
    max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
    ```
@ -149,70 +135,3 @@ The script follows a systematic process to find the optimal parameters:

4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.

5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.

## Batched `auto_tune`

The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.

### Prerequisites

- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.

### How to Run

1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.

2. **Execute the script**:

    ```bash
    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
    ```

    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).

### Configuration File

The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.

Here is an example `runs_config.json` with two benchmark configurations (`system` can be either `TPU` or `GPU`):

```json
[
    {
        "base": "/home/user",
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "system": "TPU",
        "tp": 8,
        "input_len": 128,
        "output_len": 2048,
        "max_model_len": 2300,
        "num_seqs_list": "128 256",
        "num_batched_tokens_list": "8192 16384"
    },
    {
        "base": "/home/user",
        "model": "meta-llama/Llama-3.1-70B-Instruct",
        "system": "TPU",
        "tp": 8,
        "input_len": 4000,
        "output_len": 16,
        "max_model_len": 4096,
        "num_seqs_list": "64 128",
        "num_batched_tokens_list": "4096 8192",
        "max_latency_allowed_ms": 500
    }
]
```

### Output

The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:

- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).

A summary of successful and failed runs is also printed to the console upon completion.
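As a quick sketch of inspecting what was written back after a batch finishes (assuming the `runs_config.json` example above and that `jq` is installed):

```bash
# Show the recorded outcome for the first run in the batch.
jq '.[0] | {run_id, status, gcs_results}' runs_config.json
```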
@ -1,45 +1,27 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
||||||
# See details in README (benchmarks/auto_tune/README.md).
|
# See details in README (benchmarks/auto_tune/README.md).
|
||||||
|
|
||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
BASE=""
|
||||||
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
|
SYSTEM="TPU"
|
||||||
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
|
TP=1
|
||||||
SYSTEM=${SYSTEM:-"TPU"}
|
DOWNLOAD_DIR=""
|
||||||
TP=${TP:-1}
|
INPUT_LEN=4000
|
||||||
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
|
OUTPUT_LEN=16
|
||||||
INPUT_LEN=${INPUT_LEN:-4000}
|
MIN_CACHE_HIT_PCT=0
|
||||||
OUTPUT_LEN=${OUTPUT_LEN:-16}
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
|
NUM_SEQS_LIST="128 256"
|
||||||
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
||||||
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
|
||||||
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
|
||||||
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
|
||||||
|
|
||||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||||
RESULT="$LOG_FOLDER/result.txt"
|
RESULT="$LOG_FOLDER/result.txt"
|
||||||
PROFILE_PATH="$LOG_FOLDER/profile"
|
PROFILE_PATH="$LOG_FOLDER/profile"
|
||||||
|
|
||||||
echo "====================== AUTO TUNE PARAMETERS ===================="
|
echo "result file: $RESULT"
|
||||||
echo "SCRIPT_DIR=$SCRIPT_DIR"
|
echo "model: $MODEL"
|
||||||
echo "BASE=$BASE"
|
|
||||||
echo "MODEL=$MODEL"
|
|
||||||
echo "SYSTEM=$SYSTEM"
|
|
||||||
echo "TP=$TP"
|
|
||||||
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
|
|
||||||
echo "INPUT_LEN=$INPUT_LEN"
|
|
||||||
echo "OUTPUT_LEN=$OUTPUT_LEN"
|
|
||||||
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
|
|
||||||
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
|
|
||||||
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
|
|
||||||
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
|
|
||||||
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
|
|
||||||
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
|
|
||||||
echo "RESULT_FILE=$RESULT"
|
|
||||||
echo "====================== AUTO TUNEPARAMETERS ===================="
|
|
||||||
|
|
||||||
rm -rf $LOG_FOLDER
|
rm -rf $LOG_FOLDER
|
||||||
rm -rf $PROFILE_PATH
|
rm -rf $PROFILE_PATH
|
||||||
@ -54,18 +36,10 @@ current_hash=$(git rev-parse HEAD)
|
|||||||
echo "hash:$current_hash" >> "$RESULT"
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
echo "current_hash: $current_hash"
|
echo "current_hash: $current_hash"
|
||||||
|
|
||||||
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
|
|
||||||
RED='\033[0;31m'
|
|
||||||
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
|
|
||||||
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
best_throughput=0
|
best_throughput=0
|
||||||
best_max_num_seqs=0
|
best_max_num_seqs=0
|
||||||
best_num_batched_tokens=0
|
best_num_batched_tokens=0
|
||||||
best_goodput=0
|
best_goodput=0
|
||||||
best_request_rate=0
|
|
||||||
|
|
||||||
start_server() {
|
start_server() {
|
||||||
local gpu_memory_utilization=$1
|
local gpu_memory_utilization=$1
|
||||||
@ -73,47 +47,26 @@ start_server() {
|
|||||||
local max_num_batched_tokens=$3
|
local max_num_batched_tokens=$3
|
||||||
local vllm_log=$4
|
local vllm_log=$4
|
||||||
local profile_dir=$5
|
local profile_dir=$5
|
||||||
|
|
||||||
|
pkill -f vllm
|
||||||
|
|
||||||
pkill -if "vllm serve" || true
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
||||||
|
--disable-log-requests \
|
||||||
# Define the common arguments as a bash array.
|
--port 8004 \
|
||||||
# Each argument and its value are separate elements.
|
--gpu-memory-utilization $gpu_memory_utilization \
|
||||||
local common_args_array=(
|
--max-num-seqs $max_num_seqs \
|
||||||
"$MODEL"
|
--max-num-batched-tokens $max_num_batched_tokens \
|
||||||
"--disable-log-requests"
|
--tensor-parallel-size $TP \
|
||||||
"--port" "8004"
|
--enable-prefix-caching \
|
||||||
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
--load-format dummy \
|
||||||
"--max-num-seqs" "$max_num_seqs"
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
"--max-num-batched-tokens" "$max_num_batched_tokens"
|
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||||
"--tensor-parallel-size" "$TP"
|
|
||||||
"--enable-prefix-caching"
|
|
||||||
"--load-format" "dummy"
|
|
||||||
"--download-dir" "$DOWNLOAD_DIR"
|
|
||||||
"--max-model-len" "$MAX_MODEL_LEN"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use the array expansion "${common_args_array[@]}"
|
|
||||||
# This correctly passes each element as a separate argument.
|
|
||||||
if [[ -n "$profile_dir" ]]; then
|
|
||||||
# Start server with profiling enabled
|
|
||||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
|
||||||
else
|
|
||||||
# Start server without profiling
|
|
||||||
VLLM_SERVER_DEV_MODE=1 \
|
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
|
||||||
fi
|
|
||||||
local server_pid=$!
|
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
for i in {1..60}; do
|
for i in {1..60}; do
|
||||||
# This line checks whether the server is still alive or not,
|
|
||||||
# since that we should always have permission to send signal to the server process.
|
|
||||||
kill -0 $server_pid 2> /dev/null || break
|
|
||||||
|
|
||||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
server_started=1
|
server_started=1
|
||||||
break
|
break
|
||||||
@ -121,29 +74,45 @@ start_server() {
|
|||||||
sleep 10
|
sleep 10
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
if (( ! server_started )); then
|
if (( ! server_started )); then
|
||||||
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
|
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||||
return 1
|
return 1
|
||||||
else
|
else
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
update_best_profile() {
|
||||||
|
local profile_dir=$1
|
||||||
|
local profile_index=$2
|
||||||
|
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
|
||||||
|
selected_profile_file=
|
||||||
|
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||||
|
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
||||||
|
fi
|
||||||
|
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||||
|
selected_profile_file="${sorted_paths[$profile_index]}"
|
||||||
|
fi
|
||||||
|
rm -f $PROFILE_PATH/*
|
||||||
|
cp $selected_profile_file $PROFILE_PATH
|
||||||
|
}
|
||||||
|
|
||||||
run_benchmark() {
|
run_benchmark() {
|
||||||
local max_num_seqs=$1
|
local max_num_seqs=$1
|
||||||
local max_num_batched_tokens=$2
|
local max_num_batched_tokens=$2
|
||||||
local gpu_memory_utilization=$3
|
local gpu_memory_utilization=$3
|
||||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||||
|
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
|
||||||
echo "vllm_log: $vllm_log"
|
echo "vllm_log: $vllm_log"
|
||||||
echo
|
echo
|
||||||
rm -f $vllm_log
|
rm -f $vllm_log
|
||||||
pkill -if "vllm serve" || true
|
mkdir -p $profile_dir
|
||||||
|
pkill -f vllm
|
||||||
|
local profile_index=0
|
||||||
|
|
||||||
echo "starting server..."
|
echo "starting server..."
|
||||||
# Call start_server without a profile_dir to avoid profiling overhead
|
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
|
||||||
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
|
|
||||||
result=$?
|
result=$?
|
||||||
if [[ "$result" -eq 1 ]]; then
|
if [[ "$result" -eq 1 ]]; then
|
||||||
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
@ -151,15 +120,14 @@ run_benchmark() {
|
|||||||
echo "server started."
|
echo "server started."
|
||||||
fi
|
fi
|
||||||
echo
|
echo
|
||||||
|
|
||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
meet_latency_requirement=0
|
meet_latency_requirement=0
|
||||||
# get a basic qps by using request-rate inf
|
# get a basic qps by using request-rate inf
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||||
# --profile flag is removed from this call
|
python3 benchmarks/benchmark_serving.py \
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
@ -172,7 +140,8 @@ run_benchmark() {
|
|||||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
--num-prompts 1000 \
|
--num-prompts 1000 \
|
||||||
--random-prefix-len $prefix_len \
|
--random-prefix-len $prefix_len \
|
||||||
--port 8004 &> "$bm_log"
|
--port 8004 \
|
||||||
|
--profile &> "$bm_log"
|
||||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
@ -186,11 +155,12 @@ run_benchmark() {
|
|||||||
# start from request-rate as int(throughput) + 1
|
# start from request-rate as int(throughput) + 1
|
||||||
request_rate=$((${throughput%.*} + 1))
|
request_rate=$((${throughput%.*} + 1))
|
||||||
while ((request_rate > 0)); do
|
while ((request_rate > 0)); do
|
||||||
|
profile_index=$((profile_index+1))
|
||||||
# clear prefix cache
|
# clear prefix cache
|
||||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
sleep 5
|
sleep 5
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||||
vllm bench serve \
|
python3 benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
@ -223,7 +193,12 @@ run_benchmark() {
|
|||||||
best_max_num_seqs=$max_num_seqs
|
best_max_num_seqs=$max_num_seqs
|
||||||
best_num_batched_tokens=$max_num_batched_tokens
|
best_num_batched_tokens=$max_num_batched_tokens
|
||||||
best_goodput=$goodput
|
best_goodput=$goodput
|
||||||
best_request_rate=$request_rate
|
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||||
|
update_best_profile "$profile_dir/plugins/profile" $profile_index
|
||||||
|
fi
|
||||||
|
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||||
|
update_best_profile "$profile_dir" $profile_index
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
||||||
@ -232,9 +207,9 @@ run_benchmark() {
|
|||||||
|
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
|
||||||
pkill -if "vllm serve" || true
|
pkill vllm
|
||||||
sleep 10
|
sleep 10
|
||||||
echo "===================="
|
printf '=%.0s' $(seq 1 20)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -245,8 +220,7 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
|
|||||||
gpu_memory_utilization=0.98
|
gpu_memory_utilization=0.98
|
||||||
find_gpu_memory_utilization=0
|
find_gpu_memory_utilization=0
|
||||||
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
||||||
# Pass empty string for profile_dir argument
|
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
|
||||||
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
|
|
||||||
result=$?
|
result=$?
|
||||||
if [[ "$result" -eq 0 ]]; then
|
if [[ "$result" -eq 0 ]]; then
|
||||||
find_gpu_memory_utilization=1
|
find_gpu_memory_utilization=1
|
||||||
@ -269,45 +243,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
|
|||||||
done
|
done
|
||||||
done
|
done
|
||||||
echo "finish permutations"
|
echo "finish permutations"
|
||||||
|
|
||||||
# =================================================================================
|
|
||||||
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
|
|
||||||
# =================================================================================
|
|
||||||
if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
|
||||||
echo
|
|
||||||
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
|
|
||||||
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
|
|
||||||
echo
|
|
||||||
|
|
||||||
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
|
|
||||||
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
|
|
||||||
|
|
||||||
# Start server with the best params and profiling ENABLED
|
|
||||||
echo "Starting server for profiling..."
|
|
||||||
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
|
|
||||||
|
|
||||||
# Run benchmark with the best params and the --profile flag
|
|
||||||
echo "Running benchmark with profiling..."
|
|
||||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
|
||||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--model $MODEL \
|
|
||||||
--dataset-name random \
|
|
||||||
--random-input-len $adjusted_input_len \
|
|
||||||
--random-output-len $OUTPUT_LEN \
|
|
||||||
--ignore-eos \
|
|
||||||
--disable-tqdm \
|
|
||||||
--request-rate $best_request_rate \
|
|
||||||
--percentile-metrics ttft,tpot,itl,e2el \
|
|
||||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
|
||||||
--num-prompts 100 \
|
|
||||||
--random-prefix-len $prefix_len \
|
|
||||||
--port 8004 \
|
|
||||||
--profile &> "$bm_log"
|
|
||||||
else
|
|
||||||
echo "No configuration met the latency requirements. Skipping final profiling run."
|
|
||||||
fi
|
|
||||||
pkill -if "vllm serve" || true
|
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|
||||||
|
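Note on the final profiling run above: the prefix/input split is plain integer arithmetic. A sketch in Python with placeholder values (the names mirror the shell variables; the numbers are illustrative, not repository defaults):

# Illustrative only: how prefix_len and adjusted_input_len relate above.
INPUT_LEN = 4096
MIN_CACHE_HIT_PCT = 50
prefix_len = INPUT_LEN * MIN_CACHE_HIT_PCT // 100   # shared prefix tokens
adjusted_input_len = INPUT_LEN - prefix_len          # random tokens per prompt
assert prefix_len + adjusted_input_len == INPUT_LEN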
@ -1,128 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
INPUT_JSON="$1"
|
|
||||||
GCS_PATH="$2" # Optional GCS path for uploading results for each run
|
|
||||||
|
|
||||||
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
|
||||||
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
|
|
||||||
|
|
||||||
if [[ -z "$INPUT_JSON" ]]; then
|
|
||||||
echo "Error: Input JSON file not provided."
|
|
||||||
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "$INPUT_JSON" ]]; then
|
|
||||||
echo "Error: File not found at '$INPUT_JSON'"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! command -v jq &> /dev/null; then
|
|
||||||
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
|
|
||||||
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
SUCCESS_COUNT=0
|
|
||||||
FAILURE_COUNT=0
|
|
||||||
FAILED_RUNS=()
|
|
||||||
SCRIPT_START_TIME=$(date +%s)
|
|
||||||
|
|
||||||
json_content=$(cat "$INPUT_JSON")
|
|
||||||
if ! num_runs=$(echo "$json_content" | jq 'length'); then
|
|
||||||
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
|
|
||||||
echo "Starting benchmark runs..."
|
|
||||||
echo "--------------------------------------------------"
|
|
||||||
|
|
||||||
for i in $(seq 0 $(($num_runs - 1))); do
|
|
||||||
run_object=$(echo "$json_content" | jq ".[$i]")
|
|
||||||
|
|
||||||
RUN_START_TIME=$(date +%s)
|
|
||||||
ENV_VARS_ARRAY=()
|
|
||||||
# Dynamically create env vars from the JSON object's keys
|
|
||||||
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
|
|
||||||
value=$(echo "$run_object" | jq -r ".$key")
|
|
||||||
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
|
|
||||||
ENV_VARS_ARRAY+=("${var_name}=${value}")
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
|
|
||||||
|
|
||||||
# Execute auto_tune.sh and capture output
|
|
||||||
RUN_OUTPUT_FILE=$(mktemp)
|
|
||||||
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
|
|
||||||
STATUS="SUCCESS"
|
|
||||||
((SUCCESS_COUNT++))
|
|
||||||
else
|
|
||||||
STATUS="FAILURE"
|
|
||||||
((FAILURE_COUNT++))
|
|
||||||
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
|
|
||||||
rm "$RUN_OUTPUT_FILE"
|
|
||||||
|
|
||||||
# Parse results and optionally upload them to GCS
|
|
||||||
RUN_ID=""
|
|
||||||
RESULTS=""
|
|
||||||
GCS_RESULTS_URL=""
|
|
||||||
if [[ "$STATUS" == "SUCCESS" ]]; then
|
|
||||||
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
|
|
||||||
|
|
||||||
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
|
|
||||||
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
|
|
||||||
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
|
|
||||||
RESULTS=$(cat "$RESULT_FILE_PATH")
|
|
||||||
|
|
||||||
if [[ -n "$GCS_PATH" ]]; then
|
|
||||||
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
|
|
||||||
echo "Uploading results to GCS..."
|
|
||||||
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
|
|
||||||
echo "GCS upload successful."
|
|
||||||
else
|
|
||||||
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "Warning: Could not find result file for a successful run."
|
|
||||||
STATUS="WARNING_NO_RESULT_FILE"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Add the results back into the JSON object for this run
|
|
||||||
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
|
|
||||||
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
|
|
||||||
|
|
||||||
RUN_END_TIME=$(date +%s)
|
|
||||||
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
|
|
||||||
echo "--------------------------------------------------"
|
|
||||||
|
|
||||||
# Save intermediate progress back to the file
|
|
||||||
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
SCRIPT_END_TIME=$(date +%s)
|
|
||||||
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
|
|
||||||
echo
|
|
||||||
echo "====================== SUMMARY ======================"
|
|
||||||
echo "Successful runs: $SUCCESS_COUNT"
|
|
||||||
echo "Failed runs: $FAILURE_COUNT"
|
|
||||||
echo "==================================================="
|
|
||||||
|
|
||||||
if [[ $FAILURE_COUNT -gt 0 ]]; then
|
|
||||||
echo "Details of failed runs (see JSON file for full parameters):"
|
|
||||||
for failed in "${FAILED_RUNS[@]}"; do
|
|
||||||
echo " - $failed"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Updated results have been saved to '$INPUT_JSON'."
|
|
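For readers less familiar with the jq pipeline in the deleted wrapper script above, a rough Python equivalent of the key-to-environment-variable step is sketched below; it is illustrative only and not part of the repository (the input file name is a placeholder).

import json
import re

def run_to_env(run_object: dict) -> dict[str, str]:
    # Mirror of: tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_'
    return {
        re.sub(r"[^A-Z0-9_]", "", key.upper()): str(value)
        for key, value in run_object.items()
    }

# Example: {"max_num_seqs": 128} -> {"MAX_NUM_SEQS": "128"}
with open("runs.json") as f:  # hypothetical input file
    for run in json.load(f):
        print(run_to_env(run))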
@ -8,6 +8,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
@ -27,13 +28,12 @@ class RequestFuncInput:
|
|||||||
prompt_len: int
|
prompt_len: int
|
||||||
output_len: int
|
output_len: int
|
||||||
model: str
|
model: str
|
||||||
model_name: str | None = None
|
model_name: Optional[str] = None
|
||||||
logprobs: int | None = None
|
logprobs: Optional[int] = None
|
||||||
extra_body: dict | None = None
|
extra_body: Optional[dict] = None
|
||||||
multi_modal_content: dict | list[dict] | None = None
|
multi_modal_content: Optional[dict] = None
|
||||||
ignore_eos: bool = False
|
ignore_eos: bool = False
|
||||||
language: str | None = None
|
language: Optional[str] = None
|
||||||
request_id: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -51,7 +51,7 @@ class RequestFuncOutput:
|
|||||||
|
|
||||||
async def async_request_tgi(
|
async def async_request_tgi(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -71,9 +71,6 @@ async def async_request_tgi(
|
|||||||
"inputs": request_func_input.prompt,
|
"inputs": request_func_input.prompt,
|
||||||
"parameters": params,
|
"parameters": params,
|
||||||
}
|
}
|
||||||
headers = None
|
|
||||||
if request_func_input.request_id:
|
|
||||||
headers = {"x-request-id": request_func_input.request_id}
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
if request_func_input.ignore_eos:
|
if request_func_input.ignore_eos:
|
||||||
@ -85,9 +82,7 @@ async def async_request_tgi(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
url=api_url, json=payload, headers=headers
|
|
||||||
) as response:
|
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
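The newer side of the hunk above (main) threads an optional request id through to the backend as an x-request-id header, attaching it only when one was provided. A minimal standalone sketch of that pattern with aiohttp, using a placeholder URL and payload:

import aiohttp

async def post_with_request_id(api_url: str, payload: dict, request_id: str | None = None):
    # Only attach the header when a request id was provided, as in the diff above.
    headers = {"x-request-id": request_id} if request_id else None
    async with aiohttp.ClientSession() as session:
        async with session.post(url=api_url, json=payload, headers=headers) as response:
            return response.status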
@ -132,7 +127,7 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
async def async_request_trt_llm(
|
async def async_request_trt_llm(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -150,9 +145,6 @@ async def async_request_trt_llm(
|
|||||||
}
|
}
|
||||||
if request_func_input.ignore_eos:
|
if request_func_input.ignore_eos:
|
||||||
payload["min_length"] = request_func_input.output_len
|
payload["min_length"] = request_func_input.output_len
|
||||||
headers = None
|
|
||||||
if request_func_input.request_id:
|
|
||||||
headers = {"x-request-id": request_func_input.request_id}
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
@ -160,9 +152,7 @@ async def async_request_trt_llm(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
url=api_url, json=payload, headers=headers
|
|
||||||
) as response:
|
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
@ -203,7 +193,7 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
async def async_request_deepspeed_mii(
|
async def async_request_deepspeed_mii(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -221,8 +211,6 @@ async def async_request_deepspeed_mii(
|
|||||||
"top_p": 1.0,
|
"top_p": 1.0,
|
||||||
}
|
}
|
||||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
if request_func_input.request_id:
|
|
||||||
headers["x-request-id"] = request_func_input.request_id
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
@ -266,7 +254,7 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
async def async_request_openai_completions(
|
async def async_request_openai_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -295,8 +283,6 @@ async def async_request_openai_completions(
|
|||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
if request_func_input.request_id:
|
|
||||||
headers["x-request-id"] = request_func_input.request_id
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
@ -366,7 +352,7 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
async def async_request_openai_chat_completions(
|
async def async_request_openai_chat_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("chat/completions", "profile")), (
|
assert api_url.endswith(("chat/completions", "profile")), (
|
||||||
@ -378,15 +364,7 @@ async def async_request_openai_chat_completions(
|
|||||||
) as session:
|
) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
if request_func_input.multi_modal_content:
|
if request_func_input.multi_modal_content:
|
||||||
mm_content = request_func_input.multi_modal_content
|
content.append(request_func_input.multi_modal_content)
|
||||||
if isinstance(mm_content, list):
|
|
||||||
content.extend(mm_content)
|
|
||||||
elif isinstance(mm_content, dict):
|
|
||||||
content.append(mm_content)
|
|
||||||
else:
|
|
||||||
raise TypeError(
|
|
||||||
"multi_modal_content must be a dict or list[dict] for openai-chat"
|
|
||||||
)
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name
|
if request_func_input.model_name
|
||||||
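The newer side of the change above accepts multi_modal_content as either a single content part or a list of parts and extends the chat "content" array accordingly. A sketch of the resulting OpenAI-style message, where the image_url parts and URLs are illustrative placeholders:

content = [{"type": "text", "text": "Describe these images."}]
mm_content = [
    {"type": "image_url", "image_url": {"url": "http://example.com/a.png"}},
    {"type": "image_url", "image_url": {"url": "http://example.com/b.png"}},
]
if isinstance(mm_content, list):
    content.extend(mm_content)
elif isinstance(mm_content, dict):
    content.append(mm_content)
messages = [{"role": "user", "content": content}]  # used as the chat payload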
@ -409,8 +387,6 @@ async def async_request_openai_chat_completions(
|
|||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
}
|
}
|
||||||
if request_func_input.request_id:
|
|
||||||
headers["x-request-id"] = request_func_input.request_id
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
@ -475,7 +451,7 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
async def async_request_openai_audio(
|
async def async_request_openai_audio(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
import soundfile
|
import soundfile
|
||||||
@ -507,8 +483,6 @@ async def async_request_openai_audio(
|
|||||||
headers = {
|
headers = {
|
||||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
}
|
}
|
||||||
if request_func_input.request_id:
|
|
||||||
headers["x-request-id"] = request_func_input.request_id
|
|
||||||
|
|
||||||
# Send audio file
|
# Send audio file
|
||||||
def to_bytes(y, sr):
|
def to_bytes(y, sr):
|
||||||
@ -517,10 +491,7 @@ async def async_request_openai_audio(
|
|||||||
buffer.seek(0)
|
buffer.seek(0)
|
||||||
return buffer
|
return buffer
|
||||||
|
|
||||||
mm_audio = request_func_input.multi_modal_content
|
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||||
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
|
|
||||||
raise TypeError("multi_modal_content must be a dict containing 'audio'")
|
|
||||||
with to_bytes(*mm_audio["audio"]) as f:
|
|
||||||
form = aiohttp.FormData()
|
form = aiohttp.FormData()
|
||||||
form.add_field("file", f, content_type="audio/wav")
|
form.add_field("file", f, content_type="audio/wav")
|
||||||
for key, value in payload.items():
|
for key, value in payload.items():
|
||||||
@ -609,7 +580,7 @@ def get_tokenizer(
|
|||||||
tokenizer_mode: str = "auto",
|
tokenizer_mode: str = "auto",
|
||||||
trust_remote_code: bool = False,
|
trust_remote_code: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path
|
pretrained_model_name_or_path
|
||||||
):
|
):
|
||||||
|
@ -1,74 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import gc
|
|
||||||
|
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
from vllm.v1.core.block_pool import BlockPool
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
rows = []
|
|
||||||
for allocate_block in args.allocate_blocks:
|
|
||||||
# Enforce a GC collect ahead to minimize the impact among runs
|
|
||||||
gc.collect()
|
|
||||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
|
||||||
|
|
||||||
get_blocks_times = TimeCollector(TimeCollector.US)
|
|
||||||
free_blocks_times = TimeCollector(TimeCollector.US)
|
|
||||||
for _ in range(args.num_iteration):
|
|
||||||
with get_blocks_times:
|
|
||||||
blocks = block_pool.get_new_blocks(allocate_block)
|
|
||||||
with free_blocks_times:
|
|
||||||
block_pool.free_blocks(blocks)
|
|
||||||
|
|
||||||
rows.append(
|
|
||||||
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
|
|
||||||
+ get_blocks_times.dump_avg_max()
|
|
||||||
+ free_blocks_times.dump_avg_max()
|
|
||||||
)
|
|
||||||
|
|
||||||
print(
|
|
||||||
tabulate(
|
|
||||||
rows,
|
|
||||||
headers=[
|
|
||||||
"Iterations",
|
|
||||||
"Total\nBlocks",
|
|
||||||
"Allocated\nBlocks",
|
|
||||||
"Get Blocks\nAvg (us)",
|
|
||||||
"Get Blocks\nMax (us)",
|
|
||||||
"Free Blocks\nAvg (us)",
|
|
||||||
"Free Blocks\nMax (us)",
|
|
||||||
],
|
|
||||||
tablefmt="grid",
|
|
||||||
floatfmt=".3f",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def invoke_main() -> None:
|
|
||||||
parser = FlexibleArgumentParser(
|
|
||||||
description="Benchmark the performance of BlockPool for KV Cache."
|
|
||||||
)
|
|
||||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-iteration",
|
|
||||||
type=int,
|
|
||||||
default=1000,
|
|
||||||
help="Number of iterations to run to stabilize final data readings",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--allocate-blocks",
|
|
||||||
type=int,
|
|
||||||
nargs="*",
|
|
||||||
default=[10, 50, 100, 500, 1000],
|
|
||||||
help="Number of blocks to allocate",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
invoke_main() # pragma: no cover
|
|
1173
benchmarks/benchmark_dataset.py (new file, 1173 lines)
File diff suppressed because it is too large
@ -1,17 +1,186 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import sys
|
"""Benchmark the latency of processing a single batch of requests."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import dataclasses
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
from vllm.inputs import PromptType
|
||||||
|
from vllm.sampling_params import BeamSearchParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_pytorch_benchmark_format(
|
||||||
|
args: argparse.Namespace, results: dict[str, Any]
|
||||||
|
) -> None:
|
||||||
|
pt_records = convert_to_pytorch_benchmark_format(
|
||||||
|
args=args,
|
||||||
|
metrics={"latency": results["latencies"]},
|
||||||
|
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
|
||||||
|
)
|
||||||
|
if pt_records:
|
||||||
|
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||||
|
write_to_json(pt_file, pt_records)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: argparse.Namespace):
|
||||||
|
print(args)
|
||||||
|
|
||||||
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
|
|
||||||
|
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||||
|
# the engine will automatically process the request in multiple batches.
|
||||||
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
|
assert llm.llm_engine.model_config.max_model_len >= (
|
||||||
|
args.input_len + args.output_len
|
||||||
|
), (
|
||||||
|
"Please ensure that max_model_len is greater than"
|
||||||
|
" the sum of input_len and output_len."
|
||||||
|
)
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
n=args.n,
|
||||||
|
temperature=1.0,
|
||||||
|
top_p=1.0,
|
||||||
|
ignore_eos=True,
|
||||||
|
max_tokens=args.output_len,
|
||||||
|
detokenize=not args.disable_detokenize,
|
||||||
|
)
|
||||||
|
print(sampling_params)
|
||||||
|
dummy_prompt_token_ids = np.random.randint(
|
||||||
|
10000, size=(args.batch_size, args.input_len)
|
||||||
|
)
|
||||||
|
dummy_prompts: list[PromptType] = [
|
||||||
|
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
|
||||||
|
]
|
||||||
|
|
||||||
|
def llm_generate():
|
||||||
|
if not args.use_beam_search:
|
||||||
|
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
|
||||||
|
else:
|
||||||
|
llm.beam_search(
|
||||||
|
dummy_prompts,
|
||||||
|
BeamSearchParams(
|
||||||
|
beam_width=args.n,
|
||||||
|
max_tokens=args.output_len,
|
||||||
|
ignore_eos=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_to_completion(profile_dir: Optional[str] = None):
|
||||||
|
if profile_dir:
|
||||||
|
llm.start_profile()
|
||||||
|
llm_generate()
|
||||||
|
llm.stop_profile()
|
||||||
|
else:
|
||||||
|
start_time = time.perf_counter()
|
||||||
|
llm_generate()
|
||||||
|
end_time = time.perf_counter()
|
||||||
|
latency = end_time - start_time
|
||||||
|
return latency
|
||||||
|
|
||||||
|
print("Warming up...")
|
||||||
|
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
|
||||||
|
run_to_completion(profile_dir=None)
|
||||||
|
|
||||||
|
if args.profile:
|
||||||
|
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||||
|
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||||
|
run_to_completion(profile_dir=profile_dir)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Benchmark.
|
||||||
|
latencies = []
|
||||||
|
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
||||||
|
latencies.append(run_to_completion(profile_dir=None))
|
||||||
|
latencies = np.array(latencies)
|
||||||
|
percentages = [10, 25, 50, 75, 90, 99]
|
||||||
|
percentiles = np.percentile(latencies, percentages)
|
||||||
|
print(f"Avg latency: {np.mean(latencies)} seconds")
|
||||||
|
for percentage, percentile in zip(percentages, percentiles):
|
||||||
|
print(f"{percentage}% percentile latency: {percentile} seconds")
|
||||||
|
|
||||||
|
# Output JSON results if specified
|
||||||
|
if args.output_json:
|
||||||
|
results = {
|
||||||
|
"avg_latency": np.mean(latencies),
|
||||||
|
"latencies": latencies.tolist(),
|
||||||
|
"percentiles": dict(zip(percentages, percentiles.tolist())),
|
||||||
|
}
|
||||||
|
with open(args.output_json, "w") as f:
|
||||||
|
json.dump(results, f, indent=4)
|
||||||
|
save_to_pytorch_benchmark_format(args, results)
|
||||||
|
|
||||||
|
|
||||||
|
def create_argument_parser():
|
||||||
|
parser = FlexibleArgumentParser(
|
||||||
|
description="Benchmark the latency of processing a single batch of "
|
||||||
|
"requests till completion."
|
||||||
|
)
|
||||||
|
parser.add_argument("--input-len", type=int, default=32)
|
||||||
|
parser.add_argument("--output-len", type=int, default=128)
|
||||||
|
parser.add_argument("--batch-size", type=int, default=8)
|
||||||
|
parser.add_argument(
|
||||||
|
"--n",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
help="Number of generated sequences per prompt.",
|
||||||
|
)
|
||||||
|
parser.add_argument("--use-beam-search", action="store_true")
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-iters-warmup",
|
||||||
|
type=int,
|
||||||
|
default=10,
|
||||||
|
help="Number of iterations to run for warmup.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-iters", type=int, default=30, help="Number of iterations to run."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--profile",
|
||||||
|
action="store_true",
|
||||||
|
help="profile the generation process of a single batch",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-json",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Path to save the latency results in JSON format.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--disable-detokenize",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Do not detokenize responses (i.e. do not include "
|
||||||
|
"detokenization time in the latency measurement)"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
# V1 enables prefix caching by default which skews the latency
|
||||||
|
# numbers. We need to disable prefix caching by default.
|
||||||
|
parser.set_defaults(enable_prefix_caching=False)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("""DEPRECATED: This script has been moved to the vLLM CLI.
|
parser = create_argument_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
Please use the following command instead:
|
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||||
vllm bench latency
|
raise OSError(
|
||||||
|
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||||
For help with the new command, run:
|
"Please set it to a valid path to use torch profiler."
|
||||||
vllm bench latency --help
|
)
|
||||||
|
main(args)
|
||||||
Alternatively, you can run the new command directly with:
|
|
||||||
python -m vllm.entrypoints.cli.main bench latency --help
|
|
||||||
""")
|
|
||||||
sys.exit(1)
|
|
||||||
|
@ -1,213 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import gc
|
|
||||||
import time
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from vllm.config import (
|
|
||||||
CacheConfig,
|
|
||||||
DeviceConfig,
|
|
||||||
LoadConfig,
|
|
||||||
ModelConfig,
|
|
||||||
ParallelConfig,
|
|
||||||
SchedulerConfig,
|
|
||||||
SpeculativeConfig,
|
|
||||||
VllmConfig,
|
|
||||||
)
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
|
||||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
|
||||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_propose(args):
|
|
||||||
rows = []
|
|
||||||
for max_ngram in args.max_ngram:
|
|
||||||
collector = TimeCollector(TimeCollector.US)
|
|
||||||
|
|
||||||
model_config = ModelConfig(
|
|
||||||
model="facebook/opt-125m",
|
|
||||||
task="generate",
|
|
||||||
max_model_len=args.num_token + args.num_spec_token,
|
|
||||||
tokenizer="facebook/opt-125m",
|
|
||||||
tokenizer_mode="auto",
|
|
||||||
dtype="auto",
|
|
||||||
seed=None,
|
|
||||||
trust_remote_code=False,
|
|
||||||
)
|
|
||||||
proposer = NgramProposer(
|
|
||||||
vllm_config=VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
speculative_config=SpeculativeConfig(
|
|
||||||
prompt_lookup_min=args.min_ngram,
|
|
||||||
prompt_lookup_max=max_ngram,
|
|
||||||
num_speculative_tokens=args.num_spec_token,
|
|
||||||
method="ngram",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Warm up
|
|
||||||
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
|
|
||||||
|
|
||||||
gc.collect()
|
|
||||||
for _ in range(args.num_iteration):
|
|
||||||
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
|
|
||||||
with collector:
|
|
||||||
for i in range(args.num_req):
|
|
||||||
proposer.propose(tokens[i, :])
|
|
||||||
rows.append(
|
|
||||||
[args.num_req, args.num_token, args.min_ngram, max_ngram]
|
|
||||||
+ collector.dump_avg_max()
|
|
||||||
)
|
|
||||||
|
|
||||||
print(
|
|
||||||
tabulate(
|
|
||||||
rows,
|
|
||||||
headers=[
|
|
||||||
"# Request",
|
|
||||||
"# Token",
|
|
||||||
"Min Ngram",
|
|
||||||
"Max Ngram",
|
|
||||||
"Avg (us)",
|
|
||||||
"Max (us)",
|
|
||||||
],
|
|
||||||
tablefmt="grid",
|
|
||||||
floatfmt=".3f",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_batched_propose(args):
|
|
||||||
NUM_SPECULATIVE_TOKENS_NGRAM = 10
|
|
||||||
PROMPT_LOOKUP_MIN = 5
|
|
||||||
PROMPT_LOOKUP_MAX = 15
|
|
||||||
MAX_MODEL_LEN = int(1e7)
|
|
||||||
DEVICE = current_platform.device_type
|
|
||||||
|
|
||||||
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
|
|
||||||
|
|
||||||
speculative_config = SpeculativeConfig(
|
|
||||||
target_model_config=model_config,
|
|
||||||
target_parallel_config=ParallelConfig(),
|
|
||||||
method="ngram",
|
|
||||||
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
|
|
||||||
prompt_lookup_max=PROMPT_LOOKUP_MAX,
|
|
||||||
prompt_lookup_min=PROMPT_LOOKUP_MIN,
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
cache_config=CacheConfig(),
|
|
||||||
speculative_config=speculative_config,
|
|
||||||
device_config=DeviceConfig(device=current_platform.device_type),
|
|
||||||
parallel_config=ParallelConfig(),
|
|
||||||
load_config=LoadConfig(),
|
|
||||||
scheduler_config=SchedulerConfig(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
|
||||||
mock_pp_group = mock.MagicMock()
|
|
||||||
mock_pp_group.world_size = 1
|
|
||||||
with mock.patch(
|
|
||||||
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
|
|
||||||
):
|
|
||||||
runner = GPUModelRunner(vllm_config, DEVICE)
|
|
||||||
|
|
||||||
# hack max model len
|
|
||||||
runner.max_model_len = MAX_MODEL_LEN
|
|
||||||
runner.drafter.max_model_len = MAX_MODEL_LEN
|
|
||||||
|
|
||||||
dummy_input_batch = InputBatch(
|
|
||||||
max_num_reqs=args.num_req,
|
|
||||||
max_model_len=MAX_MODEL_LEN,
|
|
||||||
max_num_batched_tokens=args.num_req * args.num_token,
|
|
||||||
device=DEVICE,
|
|
||||||
pin_memory=False,
|
|
||||||
vocab_size=256000,
|
|
||||||
block_sizes=[16],
|
|
||||||
)
|
|
||||||
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs = ()
|
|
||||||
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
|
||||||
dummy_input_batch.token_ids_cpu = np.random.randint(
|
|
||||||
0, 20, (args.num_req, args.num_token)
|
|
||||||
)
|
|
||||||
|
|
||||||
runner.input_batch = dummy_input_batch
|
|
||||||
|
|
||||||
sampled_token_ids = [[0]] * args.num_req
|
|
||||||
|
|
||||||
print("Starting benchmark")
|
|
||||||
# first run is warmup so ignore it
|
|
||||||
for _ in range(args.num_iteration):
|
|
||||||
start = time.time()
|
|
||||||
runner.drafter.propose(
|
|
||||||
sampled_token_ids,
|
|
||||||
dummy_input_batch.req_ids,
|
|
||||||
dummy_input_batch.num_tokens_no_spec,
|
|
||||||
dummy_input_batch.token_ids_cpu,
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs,
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print(f"Iteration time (s): {end - start}")
|
|
||||||
|
|
||||||
|
|
||||||
def invoke_main() -> None:
|
|
||||||
parser = FlexibleArgumentParser(
|
|
||||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--batched", action="store_true", help="consider time to prepare batch"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-iteration",
|
|
||||||
type=int,
|
|
||||||
default=100,
|
|
||||||
help="Number of iterations to run to stabilize final data readings",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-req", type=int, default=128, help="Number of requests in the batch"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-token", type=int, default=1500, help="Number of tokens for each request"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--min-ngram",
|
|
||||||
type=int,
|
|
||||||
default=3,
|
|
||||||
help="Minimum n-gram to match",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-ngram",
|
|
||||||
type=int,
|
|
||||||
nargs="*",
|
|
||||||
default=[5, 7, 10, 15, 20],
|
|
||||||
help="Maximum n-gram to match",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-spec-token",
|
|
||||||
type=int,
|
|
||||||
default=3,
|
|
||||||
help="Number of speculative tokens to generate",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if not args.batched:
|
|
||||||
benchmark_propose(args)
|
|
||||||
else:
|
|
||||||
benchmark_batched_propose(args)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
# Example command lines:
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
|
|
||||||
""" # noqa: E501
|
|
||||||
if __name__ == "__main__":
|
|
||||||
invoke_main() # pragma: no cover
|
|
@ -32,6 +32,7 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
@ -127,7 +128,7 @@ def sample_requests_from_random(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
prefix_len: int,
|
prefix_len: int,
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
requests = []
|
requests = []
|
||||||
|
@ -7,6 +7,7 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
@ -23,7 +24,7 @@ def sample_requests(
|
|||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[tuple[str, int, int, int]]:
|
) -> list[tuple[str, int, int, int]]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
|
File diff suppressed because it is too large
@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs.
|
|||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
(vLLM OpenAI API server)
|
(vLLM OpenAI API server)
|
||||||
vllm serve <your_model>
|
vllm serve <your_model> --disable-log-requests
|
||||||
|
|
||||||
On the client side, run:
|
On the client side, run:
|
||||||
python benchmarks/benchmark_serving_structured_output.py \
|
python benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -31,19 +31,20 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
from contextlib import nullcontext
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from backend_request_func import (
|
from backend_request_func import (
|
||||||
ASYNC_REQUEST_FUNCS,
|
ASYNC_REQUEST_FUNCS,
|
||||||
RequestFuncInput,
|
RequestFuncInput,
|
||||||
RequestFuncOutput,
|
RequestFuncOutput,
|
||||||
)
|
)
|
||||||
from tqdm.asyncio import tqdm
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
@ -316,7 +317,7 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[float],
|
selected_percentiles: list[float],
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||||
actual_output_lens: list[int] = []
|
actual_output_lens: list[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
@ -436,9 +437,9 @@ async def benchmark(
|
|||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[str],
|
selected_percentiles: list[str],
|
||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
max_concurrency: int | None,
|
max_concurrency: Optional[int],
|
||||||
structured_output_ratio: float,
|
structured_output_ratio: float,
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@ -448,8 +449,7 @@ async def benchmark(
|
|||||||
def prepare_extra_body(request) -> dict:
|
def prepare_extra_body(request) -> dict:
|
||||||
extra_body = {}
|
extra_body = {}
|
||||||
# Add the schema to the extra_body
|
# Add the schema to the extra_body
|
||||||
extra_body["structured_outputs"] = {}
|
extra_body[request.structure_type] = request.schema
|
||||||
extra_body["structured_outputs"][request.structure_type] = request.schema
|
|
||||||
return extra_body
|
return extra_body
|
||||||
|
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
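The two sides of the hunk above disagree on where the schema lives in extra_body: the newer side (main) nests it under a "structured_outputs" key, while the branch side uses the older top-level guided_* keys. A sketch of both shapes with a placeholder schema:

schema = {"type": "object", "properties": {"name": {"type": "string"}}}

# newer shape: nested under "structured_outputs", keyed by the structure type
extra_body_main = {"structured_outputs": {"json": schema}}

# older shape: the structure type itself is the top-level key
extra_body_branch = {"guided_json": schema}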
@ -502,9 +502,15 @@ async def benchmark(
|
|||||||
|
|
||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
|
# This can be used once the minimum Python version is 3.10 or higher,
|
||||||
|
# and it will simplify the code in limited_request_func.
|
||||||
|
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
|
# if max_concurrency else contextlib.nullcontext())
|
||||||
|
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||||
|
|
||||||
async def limited_request_func(request_func_input, pbar):
|
async def limited_request_func(request_func_input, pbar):
|
||||||
|
if semaphore is None:
|
||||||
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
|
|
||||||
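The newer side above relies on contextlib.nullcontext supporting "async with" (Python 3.10+), which is exactly what the branch-side comment anticipates and what removes the None check from limited_request_func. A minimal sketch of that pattern:

import asyncio
from contextlib import nullcontext

async def limited_call(fn, max_concurrency: int | None):
    # nullcontext() is an async context manager on Python 3.10+, so the same
    # "async with" covers both the limited and the unlimited case.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
    async with semaphore:
        return await fn()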
@ -532,6 +538,20 @@ async def benchmark(
|
|||||||
)
|
)
|
||||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
if profile:
|
||||||
|
print("Stopping profiler...")
|
||||||
|
profile_input = RequestFuncInput(
|
||||||
|
model=model_id,
|
||||||
|
prompt=test_request.prompt,
|
||||||
|
api_url=base_url + "/stop_profile",
|
||||||
|
prompt_len=test_request.prompt_len,
|
||||||
|
output_len=test_request.expected_output_len,
|
||||||
|
extra_body={test_request.structure_type: test_request.schema},
|
||||||
|
)
|
||||||
|
profile_output = await request_func(request_func_input=profile_input)
|
||||||
|
if profile_output.success:
|
||||||
|
print("Profiler stopped")
|
||||||
|
|
||||||
if pbar is not None:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
@ -549,10 +569,6 @@ async def benchmark(
|
|||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||||
if max_concurrency is not None:
|
|
||||||
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
|
||||||
if request_rate != float("inf"):
|
|
||||||
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
|
|
||||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||||
@ -650,20 +666,6 @@ async def benchmark(
|
|||||||
|
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
if profile:
|
|
||||||
print("Stopping profiler...")
|
|
||||||
profile_input = RequestFuncInput(
|
|
||||||
model=model_id,
|
|
||||||
prompt=test_request.prompt,
|
|
||||||
api_url=base_url + "/stop_profile",
|
|
||||||
prompt_len=test_request.prompt_len,
|
|
||||||
output_len=test_request.expected_output_len,
|
|
||||||
extra_body={test_request.structure_type: test_request.schema},
|
|
||||||
)
|
|
||||||
profile_output = await request_func(request_func_input=profile_input)
|
|
||||||
if profile_output.success:
|
|
||||||
print("Profiler stopped")
|
|
||||||
|
|
||||||
return result, ret
|
return result, ret
|
||||||
|
|
||||||
|
|
||||||
@ -690,11 +692,11 @@ def evaluate(ret, args):
|
|||||||
return re.match(args.regex, actual) is not None
|
return re.match(args.regex, actual) is not None
|
||||||
|
|
||||||
def _eval_correctness(expected, actual):
|
def _eval_correctness(expected, actual):
|
||||||
if args.structure_type == "json":
|
if args.structure_type == "guided_json":
|
||||||
return _eval_correctness_json(expected, actual)
|
return _eval_correctness_json(expected, actual)
|
||||||
elif args.structure_type == "regex":
|
elif args.structure_type == "guided_regex":
|
||||||
return _eval_correctness_regex(expected, actual)
|
return _eval_correctness_regex(expected, actual)
|
||||||
elif args.structure_type == "choice":
|
elif args.structure_type == "guided_choice":
|
||||||
return _eval_correctness_choice(expected, actual)
|
return _eval_correctness_choice(expected, actual)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
@ -774,18 +776,18 @@ def main(args: argparse.Namespace):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if args.dataset == "grammar":
|
if args.dataset == "grammar":
|
||||||
args.structure_type = "grammar"
|
args.structure_type = "guided_grammar"
|
||||||
elif args.dataset == "regex":
|
elif args.dataset == "regex":
|
||||||
args.structure_type = "regex"
|
args.structure_type = "guided_regex"
|
||||||
elif args.dataset == "choice":
|
elif args.dataset == "choice":
|
||||||
args.structure_type = "choice"
|
args.structure_type = "guided_choice"
|
||||||
else:
|
else:
|
||||||
args.structure_type = "json"
|
args.structure_type = "guided_json"
|
||||||
|
|
||||||
if args.no_structured_output:
|
if args.no_structured_output:
|
||||||
args.structured_output_ratio = 0
|
args.structured_output_ratio = 0
|
||||||
if args.save_results:
|
if args.save_results:
|
||||||
result_file_name = f"{args.structured_output_ratio}so"
|
result_file_name = f"{args.structured_output_ratio}guided"
|
||||||
result_file_name += f"_{backend}"
|
result_file_name += f"_{backend}"
|
||||||
result_file_name += f"_{args.request_rate}qps"
|
result_file_name += f"_{args.request_rate}qps"
|
||||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||||
@ -903,13 +905,13 @@ def create_argument_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer",
|
"--tokenizer",
|
||||||
type=str,
|
type=str,
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer-mode",
|
"--tokenizer-mode",
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-prompts",
|
"--num-prompts",
|
||||||
@ -992,7 +994,7 @@ def create_argument_parser():
|
|||||||
"--percentile-metrics",
|
"--percentile-metrics",
|
||||||
type=str,
|
type=str,
|
||||||
default="ttft,tpot,itl",
|
default="ttft,tpot,itl",
|
||||||
help="Comma-separated list of selected metrics to report percentiles. "
|
help="Comma-separated list of selected metrics to report percentils. "
|
||||||
"This argument specifies the metrics to report percentiles. "
|
"This argument specifies the metrics to report percentiles. "
|
||||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
||||||
'Default value is "ttft,tpot,itl".',
|
'Default value is "ttft,tpot,itl".',
|
||||||
|
@ -1,17 +1,736 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import sys
|
"""Benchmark offline inference throughput."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import dataclasses
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import warnings
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import uvloop
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
from benchmark_dataset import (
|
||||||
|
AIMODataset,
|
||||||
|
BurstGPTDataset,
|
||||||
|
ConversationDataset,
|
||||||
|
InstructCoderDataset,
|
||||||
|
RandomDataset,
|
||||||
|
SampleRequest,
|
||||||
|
ShareGPTDataset,
|
||||||
|
SonnetDataset,
|
||||||
|
VisionArenaDataset,
|
||||||
|
)
|
||||||
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
|
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||||
|
from vllm.entrypoints.openai.api_server import (
|
||||||
|
build_async_engine_client_from_engine_args,
|
||||||
|
)
|
||||||
|
from vllm.inputs import TextPrompt, TokensPrompt
|
||||||
|
from vllm.lora.request import LoRARequest
|
||||||
|
from vllm.outputs import RequestOutput
|
||||||
|
from vllm.sampling_params import BeamSearchParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
||||||
|
|
||||||
|
|
||||||
|
def run_vllm(
|
||||||
|
requests: list[SampleRequest],
|
||||||
|
n: int,
|
||||||
|
engine_args: EngineArgs,
|
||||||
|
disable_detokenize: bool = False,
|
||||||
|
) -> tuple[float, Optional[list[RequestOutput]]]:
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
|
assert all(
|
||||||
|
llm.llm_engine.model_config.max_model_len
|
||||||
|
>= (request.prompt_len + request.expected_output_len)
|
||||||
|
for request in requests
|
||||||
|
), (
|
||||||
|
"Please ensure that max_model_len is greater than the sum of"
|
||||||
|
" prompt_len and expected_output_len for all requests."
|
||||||
|
)
|
||||||
|
# Add the requests to the engine.
|
||||||
|
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
||||||
|
sampling_params: list[SamplingParams] = []
|
||||||
|
for request in requests:
|
||||||
|
prompts.append(
|
||||||
|
TokensPrompt(
|
||||||
|
prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||||
|
multi_modal_data=request.multi_modal_data,
|
||||||
|
)
|
||||||
|
if "prompt_token_ids" in request.prompt
|
||||||
|
else TextPrompt(
|
||||||
|
prompt=request.prompt, multi_modal_data=request.multi_modal_data
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sampling_params.append(
|
||||||
|
SamplingParams(
|
||||||
|
n=n,
|
||||||
|
temperature=1.0,
|
||||||
|
top_p=1.0,
|
||||||
|
ignore_eos=True,
|
||||||
|
max_tokens=request.expected_output_len,
|
||||||
|
detokenize=not disable_detokenize,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
lora_requests: Optional[list[LoRARequest]] = None
|
||||||
|
if engine_args.enable_lora:
|
||||||
|
lora_requests = [request.lora_request for request in requests]
|
||||||
|
|
||||||
|
use_beam_search = False
|
||||||
|
|
||||||
|
outputs = None
|
||||||
|
if not use_beam_search:
|
||||||
|
start = time.perf_counter()
|
||||||
|
outputs = llm.generate(
|
||||||
|
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
|
||||||
|
)
|
||||||
|
end = time.perf_counter()
|
||||||
|
else:
|
||||||
|
assert lora_requests is None, "BeamSearch API does not support LoRA"
|
||||||
|
prompts = [request.prompt for request in requests]
|
||||||
|
# output_len should be the same for all requests.
|
||||||
|
output_len = requests[0].expected_output_len
|
||||||
|
for request in requests:
|
||||||
|
assert request.expected_output_len == output_len
|
||||||
|
start = time.perf_counter()
|
||||||
|
llm.beam_search(
|
||||||
|
prompts,
|
||||||
|
BeamSearchParams(
|
||||||
|
beam_width=n,
|
||||||
|
max_tokens=output_len,
|
||||||
|
ignore_eos=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
end = time.perf_counter()
|
||||||
|
return end - start, outputs
|
||||||
|
|
||||||
|
|
||||||
|
def run_vllm_chat(
|
||||||
|
requests: list[SampleRequest],
|
||||||
|
n: int,
|
||||||
|
engine_args: EngineArgs,
|
||||||
|
disable_detokenize: bool = False,
|
||||||
|
) -> tuple[float, list[RequestOutput]]:
|
||||||
|
"""
|
||||||
|
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
|
||||||
|
multimodal models as it properly handles multimodal inputs and chat
|
||||||
|
formatting. For non-multimodal models, use run_vllm() instead.
|
||||||
|
"""
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
|
|
||||||
|
assert all(
|
||||||
|
llm.llm_engine.model_config.max_model_len
|
||||||
|
>= (request.prompt_len + request.expected_output_len)
|
||||||
|
for request in requests
|
||||||
|
), (
|
||||||
|
"Please ensure that max_model_len is greater than the sum of "
|
||||||
|
"prompt_len and expected_output_len for all requests."
|
||||||
|
)
|
||||||
|
|
||||||
|
prompts = []
|
||||||
|
sampling_params: list[SamplingParams] = []
|
||||||
|
for request in requests:
|
||||||
|
prompts.append(request.prompt)
|
||||||
|
sampling_params.append(
|
||||||
|
SamplingParams(
|
||||||
|
n=n,
|
||||||
|
temperature=1.0,
|
||||||
|
top_p=1.0,
|
||||||
|
ignore_eos=True,
|
||||||
|
max_tokens=request.expected_output_len,
|
||||||
|
detokenize=not disable_detokenize,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
start = time.perf_counter()
|
||||||
|
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
|
||||||
|
end = time.perf_counter()
|
||||||
|
return end - start, outputs
|
||||||
|
|
||||||
|
|
||||||
|
async def run_vllm_async(
    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
        engine_args, disable_frontend_multiprocessing
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )

        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
        sampling_params: list[SamplingParams] = []
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                )
            )
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start


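run_vllm_async() fires one generate() stream per request and then simply drains the merged iterators, so only wall-clock time is measured. The stand-alone sketch below illustrates that fan-out/drain pattern with plain asyncio and a fake stream; it uses asyncio.gather rather than vLLM's merge_async_iterators, and all names in it are illustrative assumptions.

import asyncio
import time


async def fake_stream(i: int):
    # Stand-in for llm.generate(...): an async generator yielding partial results.
    for step in range(3):
        await asyncio.sleep(0.001)
        yield f"req{i}-chunk{step}"


async def drain_all(num_requests: int) -> float:
    start = time.perf_counter()

    async def consume(gen):
        async for _ in gen:  # discard results; only the elapsed time matters
            pass

    await asyncio.gather(*(consume(fake_stream(i)) for i in range(num_requests)))
    return time.perf_counter() - start


print(f"elapsed: {asyncio.run(drain_all(4)):.4f} s")
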
def run_hf(
    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        if not disable_detokenize:
            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


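A toy walk-through (with made-up lengths) of the padded-batch budget check above: the next request joins the current HF batch only while the padded prompt length plus padded output length stays within 2048 tokens.

max_prompt_len, max_output_len = 512, 256    # current batch maxima (hypothetical)
next_prompt_len, next_output_len = 900, 256  # candidate request (hypothetical)

padded = max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)
print(padded, padded <= 2048)  # 1156 True -> keep growing the batch
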
def run_mii(
    requests: list[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve

    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start


def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
            "requests_per_second": [results["requests_per_second"]],
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={
            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)


def get_requests(args, tokenizer):
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "tokenizer": tokenizer,
        "lora_path": args.lora_path,
        "max_loras": args.max_loras,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }

    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
        if args.backend == "vllm-chat":
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        common_kwargs["no_stream"] = args.no_stream
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)


def main(args: argparse.Namespace):
    if args.seed is None:
        args.seed = 0
    print(args)
    random.seed(args.seed)
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                )
            )
        else:
            elapsed_time, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                args.disable_detokenize,
            )
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif args.backend == "mii":
        elapsed_time = run_mii(
            requests, args.model, args.tensor_parallel_size, args.output_len
        )
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if request_outputs:
        # Note: with the vllm and vllm-chat backends,
        # we have request_outputs, which we use to count tokens.
        total_prompt_tokens = 0
        total_output_tokens = 0
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The "
            "following metrics are not accurate because image tokens are not"
            " counted. See vllm-project/vllm/issues/9778 for details."
        )
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now

    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens: {total_prompt_tokens}")
    print(f"Total num output tokens: {total_output_tokens}")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": total_num_tokens / elapsed_time,
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)


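For reference, a tiny stand-alone recomputation (with made-up numbers) of the throughput figures that main() prints and stores under --output-json; it is not part of the script.

num_requests = 1000
total_prompt_tokens = 550_000
total_output_tokens = 150_000
elapsed_time = 125.0  # seconds

total_num_tokens = total_prompt_tokens + total_output_tokens
print(
    f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
    f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
    f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
# -> Throughput: 8.00 requests/s, 5600.00 total tokens/s, 1200.00 output tokens/s
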
def validate_args(args):
    """
    Validate command-line arguments.
    """

    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model

    # === Backend Validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")

    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored "
            "since --dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since "
            "--dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name "
            "is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")

    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")

    # --data-parallel is not supported currently.
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
            "Data parallel is not supported in offline benchmark, "
            "please use benchmark serving instead"
        )


def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
    parser.add_argument(
        "--no-stream",
        action="store_true",
        help="Do not load the dataset in streaming mode.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in "
        "the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]",
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize the response (i.e. do not include "
            "detokenization time in the measurement)"
        ),
    )
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the LoRA adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help=f"Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.",
    )
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )

    # hf dataset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )

    parser = AsyncEngineArgs.add_cli_args(parser)

    return parser


if __name__ == "__main__":
    print("""DEPRECATED: This script has been moved to the vLLM CLI.

Please use the following command instead:
    vllm bench throughput

For help with the new command, run:
    vllm bench throughput --help

Alternatively, you can run the new command directly with:
    python -m vllm.entrypoints.cli.main bench throughput --help
""")
    sys.exit(1)

if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    validate_args(args)
    main(args)

@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
import math
import os
import time
from types import TracebackType
from typing import Any

@ -73,53 +72,3 @@ def write_to_json(filename: str, records: list) -> None:
        cls=InfEncoder,
        default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
    )


# Collect time and generate time metrics
#
# Example Usage:
#   collector = TimeCollector(TimeCollector.US)
#   for _ in range(total_iteration):
#       with collector:
#           ...
#   collector.dump_avg_max()
class TimeCollector:
    NS: int = 1
    US: int = NS * 1000
    MS: int = US * 1000
    S: int = MS * 1000

    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
        self._max: int | None = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()

    def collect(self, v: int) -> None:
        self.cnt += 1
        self._sum += v
        if self._max is None:
            self._max = v
        else:
            self._max = max(self._max, v)

    def avg(self) -> float | str:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

    def max(self) -> float | str:
        return self._max / self.scale if self._max else "N/A"

    def dump_avg_max(self) -> list[float | str]:
        return [self.avg(), self.max()]

    def __enter__(self) -> None:
        self.start_time = time.monotonic_ns()

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.collect(time.monotonic_ns() - self.start_time)

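The comment block above already sketches the intended usage; for completeness, here is a minimal runnable example, assuming TimeCollector is in scope (for instance, imported from the benchmark utils module shown in this hunk). time.sleep stands in for the code being measured.

import time

collector = TimeCollector(TimeCollector.MS)  # report in milliseconds
for _ in range(3):
    with collector:
        time.sleep(0.01)  # placeholder workload
avg_ms, max_ms = collector.dump_avg_max()
print(f"avg: {avg_ms:.2f} ms, max: {max_ms:.2f} ms")
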
Some files were not shown because too many files have changed in this diff.