Compare commits

..

2 Commits

Author SHA1 Message Date
1936d7bab0 format 2024-06-02 00:02:54 +00:00
996cf2de5c Fix hashing logic for non-full blocks 2024-06-02 00:01:30 +00:00
983 changed files with 22349 additions and 129783 deletions

View File

@ -1,43 +1,36 @@
import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
MAX_SIZE_MB = 200
def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
def check_wheel_size(directory):
"""Check the size of .whl files in the given directory."""
for root, _, files in os.walk(directory):
for file_name in files:
if file_name.endswith(".whl"):
wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB:
print(f"Not allowed: Wheel {wheel_path} is larger "
f"({wheel_size_mb:.2f} MB) than the limit "
f"({VLLM_MAX_SIZE_MB} MB).")
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb:.2f} MB).")
f"({wheel_size_mb} MB).")
return 0
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python check-wheel-size.py <directory>")
sys.exit(1)
directory = sys.argv[1]
sys.exit(check_wheel_size(directory))
import sys
sys.exit(check_wheel_size(sys.argv[1]))

View File

@ -0,0 +1,18 @@
#!/bin/bash
set -ex
set -o pipefail
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
cd -

View File

@ -1,12 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.671
- name: "exact_match,flexible-extract"
value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.892
- name: "exact_match,flexible-extract"
value: 0.892
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.755
- name: "exact_match,flexible-extract"
value: 0.755
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.728
- name: "exact_match,flexible-extract"
value: 0.728
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.756
- name: "exact_match,flexible-extract"
value: 0.752
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.233
- name: "exact_match,flexible-extract"
value: 0.236
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.86
- name: "exact_match,flexible-extract"
value: 0.86
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.624
- name: "exact_match,flexible-extract"
value: 0.624
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.616
- name: "exact_match,flexible-extract"
value: 0.632
limit: 250
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.593
- name: "exact_match,flexible-extract"
value: 0.588
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.595
- name: "exact_match,flexible-extract"
value: 0.582
limit: 1000
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.792
- name: "exact_match,flexible-extract"
value: 0.824
limit: 250
num_fewshot: 5

View File

@ -1,5 +0,0 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

View File

@ -1,9 +0,0 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,46 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}
while getopts "m:b:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -1,51 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -1,59 +0,0 @@
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@ -1,58 +0,0 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""
import os
from pathlib import Path
import lm_eval
import numpy
import yaml
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")
return results
def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)

View File

@ -1,153 +0,0 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
Here is an example of one test inside `latency-tests.json`:
```json
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
]
```
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
]
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -1,61 +0,0 @@
steps:
- label: "Wait for container to be ready"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
- image: badouralix/curl-jq
command:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN

View File

@ -1,45 +0,0 @@
# Nightly benchmark
The main goal of this benchmarking is two-fold:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
## Docker images
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@ -1,120 +0,0 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
<<: *common_container_settings
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
- wait
- label: "Plot"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- wait

View File

@ -1,62 +0,0 @@
## Latency tests
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
{latency_tests_markdown_table}
## Throughput tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
{throughput_tests_markdown_table}
## Serving tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
import json
import pandas as pd
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@ -1,76 +0,0 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@ -1,192 +0,0 @@
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"avg_latency": "Mean latency (ms)",
# "P10": "P10 (s)",
# "P25": "P25 (s)",
"P50": "Median latency (ms)",
# "P75": "P75 (s)",
# "P90": "P90 (s)",
"P99": "P99 latency (ms)",
}
# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "num_requests": "# of req.",
# "total_num_tokens": "Total # of tokens",
# "elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
# "tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "completed": "# of req.",
"request_throughput": "Tput (req/s)",
# "input_throughput": "Input Tput (tok/s)",
# "output_throughput": "Output Tput (tok/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
# "mean_tpot_ms": "Mean TPOT (ms)",
# "median_tpot_ms": "Median",
# "p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
}
def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
def results_to_json(latency, throughput, serving):
return json.dumps({
'latency': latency.to_dict(),
'throughput': throughput.to_dict(),
'serving': serving.to_dict()
})
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# get different percentiles
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
latency_results.append(raw_result)
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
throughput_results.append(raw_result)
continue
print(f"Skipping {test_file}")
latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results,
serving_results)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(
latency_column_mapping.keys())].rename(
columns=latency_column_mapping)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
if not throughput_results.empty:
throughput_results = throughput_results[list(
throughput_results_column_mapping.keys())].rename(
columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results,
throughput_results,
serving_results)
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown("../.buildkite/nightly-benchmarks/" +
"performance-benchmarks-descriptions.md")
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json)
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
results = latency_results.to_dict(
orient='records') + throughput_results.to_dict(
orient='records') + serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -1,26 +0,0 @@
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@ -1,6 +0,0 @@
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@ -1,102 +0,0 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@ -1,40 +0,0 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@ -1,135 +0,0 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@ -1,218 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,381 +0,0 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
ensure_sharegpt_downloaded() {
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
if [ ! -f "$FILE" ]; then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
else
echo "$FILE already exists."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
# Kill all python processes launched from current bash script
current_shell_pid=$$
processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
if [ -n "$processes" ]; then
echo "Killing the following processes matching '$1':"
echo "$processes"
echo "$processes" | xargs kill -9
else
echo "No processes found matching '$1'."
fi
}
kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pkill -f pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
# wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1
done
# remove vllm config file
rm -rf ~/.config/vllm
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
if command -v buildkite-agent >/dev/null 2>&1; then
BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -f /workspace/buildkite-agent ]; then
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
latency_test_file=$1
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
latency_command="python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
'{
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
kill_gpu_processes
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
throughput_command="python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
echo "In serving-test.json, test_name must start with \"serving_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname."
continue
fi
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
server_pid=$!
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
check_gpus
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn of the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
# prepare for benchmarking
cd benchmarks || exit 1
ensure_sharegpt_downloaded
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
upload_to_buildkite
}
main "$@"

View File

@ -1,216 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,214 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,221 +0,0 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -1,76 +0,0 @@
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "Successful req.",
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"engine": "Engine",
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
serving_md_table_with_headers = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
# document benchmarking results in markdown
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write('\n')
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -1,17 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi
echo "Waiting for image to be available..."
retries=$((retries + 1))
sleep 5
done
exit 1

View File

@ -1,32 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
},
{
"test_name": "latency_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
}
]

View File

@ -1,116 +0,0 @@
[
{
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
}
]

View File

@ -1,80 +0,0 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -1,35 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -1,32 +0,0 @@
steps:
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"

75
.buildkite/run-amd-test.sh Executable file → Normal file
View File

@ -1,16 +1,7 @@
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
set -ex
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
@ -54,10 +45,15 @@ while true; do
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
-t ${image_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
@ -66,55 +62,12 @@ trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
commands=$@
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
docker run \
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name}_${GPU} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}"
fi
/bin/bash -c "${@}"

View File

@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
fi
rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"
/workspace/buildkite-agent artifact upload "*.json"

View File

@ -3,43 +3,12 @@
set -ex
# Try building the docker image
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
docker build -t cpu-test -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py

View File

@ -1,105 +0,0 @@
#!/bin/bash
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
# wait for the cluster to be ready
sleep 10
# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes

View File

@ -1,14 +0,0 @@
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .
# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py

View File

@ -1,15 +0,0 @@
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"

View File

@ -1,14 +0,0 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

View File

@ -1,415 +1,166 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
# This script will be feed into Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.
steps:
##### fast check tests #####
- label: Documentation Build # 2min
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 15min
fast_check: true
source_file_dependencies:
- vllm/
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/
- tests/basic_correctness
commands:
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- label: Entrypoints Test # 20min
working_dir: "/vllm-workspace/tests"
fast_check: true
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/test_chat_utils.py
- label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
commands:
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/engine
- tests/tokenization
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Examples Test # 12min
working_dir: "/vllm-workspace/examples"
- label: AsyncEngine Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
command: pytest -v -s async_engine
- label: Basic Correctness Test
mirror_hardwares: [amd]
commands:
- pip install awscli tensorizer # for llava example and tensorizer test
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core
- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
command: pytest -v -s distributed/test_comm_ops.py
working_dir: "/vllm-workspace/tests"
num_gpus: 2
- label: Distributed Tests
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pynccl.py
- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
mirror_hardwares: [amd]
commands:
- pytest -v -s test_inputs.py
- pytest -v -s entrypoints -m llm
- pytest -v -s entrypoints -m openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- label: Models Test # 1hr10min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
- label: torch compile integration test
source_file_dependencies:
- vllm/
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_wrapper.py
- label: Vision Language Models Test # 42min
- label: Kernels Test %N
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pytest -v -s models -m vlm
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Prefix Caching Test # 7min
- label: Models Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py
- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
- label: Samplers Test # 18min
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: Samplers Test
#mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test # 5min
- label: LogitsProcessor Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- label: Speculative decoding tests # 22min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
- label: Utils Test
command: pytest -v -s test_utils.py
- label: LoRA Test %N # 30min each
- label: Worker Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s worker
- label: Speculative decoding tests
#mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
#mirror_hardwares: [amd]
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- label: Kernels Test %N # 30min each
source_file_dependencies:
- csrc/
- vllm/attention
- tests/kernels
- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
# Temporarily run this way because we cannot clean up GPU mem usage
# for multi GPU tests.
# TODO(sang): Fix it.
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
- pytest -v -s lora/test_long_context.py::test_self_consistency
- pytest -v -s lora/test_long_context.py::test_quality
- pytest -v -s lora/test_long_context.py::test_max_len
- label: Tensorizer Test # 11min
- label: Tensorizer Test
#mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
- label: Metrics Test
mirror_hardwares: [amd]
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
command: pytest -v -s metrics
- label: Benchmarks # 9min
- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: Quantization Test # 15min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs) # 28min
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 21min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/model_executor/layers/sampler.py
- vllm/sequence.py
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/multi_step_worker.py
- vllm/worker/model_runner_base.py
- vllm/worker/model_runner.py
- vllm/worker/multi_step_model_runner.py
- vllm/engine
- tests/multi_step
commands:
- pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 23min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora/test_long_context
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py
- label: Weight Loading Multiple GPU Test
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh
##### multi gpus test #####
##### A100 test #####
- label: Distributed Tests (A100) # optional
gpu: a100
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html

View File

@ -0,0 +1,93 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
{% if not step.no_gpu %}
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}

View File

@ -1,4 +1 @@
vllm/*.so
/.venv
/build
dist

2
.github/FUNDING.yml vendored
View File

@ -1,2 +0,0 @@
github: [vllm-project]
open_collective: [vllm]

View File

@ -20,10 +20,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -38,10 +38,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -36,10 +36,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -20,14 +20,9 @@ body:
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
<details>
<summary>The output of `python collect_env.py`</summary>
```text
Your output of `python collect_env.py` here
The output of `python collect_env.py`
```
</details>
validations:
required: true
- type: textarea
@ -89,10 +84,3 @@ body:
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -29,10 +29,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -31,10 +31,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -50,10 +50,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -47,10 +47,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -19,10 +19,3 @@ body:
attributes:
value: >
Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true

View File

@ -1,21 +0,0 @@
name: Add label on auto-merge enabled
on:
pull_request_target:
types:
- auto_merge_enabled
jobs:
add-label-on-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -30,11 +30,12 @@ jobs:
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/quantization/gguf/ggml-common.h'
'csrc/quantization/gguf/dequantize.cuh'
'csrc/quantization/gguf/vecdotq.cuh'
'csrc/quantization/gguf/mmq.cuh'
'csrc/quantization/gguf/mmvq.cuh'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
'csrc/punica/bgmv/bgmv_config.h'
'csrc/punica/bgmv/bgmv_impl.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh'
'csrc/punica/punica_ops.cu'
'csrc/punica/type_convert.h'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@ -25,22 +25,26 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.11.1
pip install mypy==1.9.0
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/executor --follow-imports skip
mypy vllm/lora --follow-imports skip
mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml

View File

@ -48,8 +48,8 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:

View File

@ -1,21 +0,0 @@
name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .

View File

@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build

View File

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}

8
.gitignore vendored
View File

@ -1,6 +1,3 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -87,9 +84,6 @@ target/
profile_default/
ipython_config.py
# generated files
**/generated/**
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
@ -192,4 +186,4 @@ _build/
hip_compat.h
# Benchmark dataset
benchmarks/*.json
*.json

View File

@ -10,7 +10,6 @@ build:
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:

View File

@ -1,23 +1,19 @@
cmake_minimum_required(VERSION 3.26)
cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@ -35,8 +31,9 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
#
# Try to find python package with an executable that exactly matches
@ -70,37 +67,17 @@ endif()
find_package(Torch REQUIRED)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually with `find_library` using torch's
# installed library path.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
add_dependencies(default _core_C)
find_library(torch_python_LIBRARY torch_python PATHS
"${TORCH_INSTALL_PREFIX}/lib")
#
# Forward the non-CUDA device extensions to external CMake scripts.
@ -110,7 +87,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
return()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
endif()
return()
endif()
@ -134,11 +111,18 @@ elseif(HIP_FOUND)
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
# ROCm 5.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
endif()
# ROCm 6.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@ -168,7 +152,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
endif()
#
# Define other extension targets
# Define extension targets
#
#
@ -187,38 +171,30 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
)
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
@ -226,57 +202,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
#
# Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
#
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
)
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
message(STATUS "Machete generation completed successfully.")
endif()
# Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
set_source_files_properties(
${MACHETE_GEN_SOURCES}
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
# raise an error if the user that this was built with an incompatible
# CUDA version)
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
endif()
define_gpu_extension_target(
@ -286,8 +217,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI)
#
@ -295,14 +225,9 @@ define_gpu_extension_target(
#
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_ops.cpp"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/marlin_moe_ops.cu")
endif()
define_gpu_extension_target(
_moe_C
DESTINATION vllm
@ -310,16 +235,93 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/punica_pybind.cpp")
#
# Copy GPU compilation flags+update for punica
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__")
#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
set(VLLM_PUNICA_GPU_ARCHES)
foreach(ARCH ${VLLM_GPU_ARCHES})
string_to_ver(CODE_VER ${ARCH})
if (CODE_VER GREATER_EQUAL 8.0)
list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
define_gpu_extension_target(
_punica_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
WITH_SOABI)
else()
message(WARNING "Unable to create _punica_C target because none of the "
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
if (VLLM_PUNICA_GPU_ARCHES AND
(ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
message(STATUS "Enabling punica extension.")
add_dependencies(default _punica_C)
endif()
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
endif()

View File

@ -5,43 +5,31 @@
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
pip install -r requirements-cuda.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -51,14 +39,17 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# files and directories related to build wheels
COPY csrc csrc
@ -66,7 +57,6 @@ COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
@ -77,99 +67,38 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
python3 setup.py bdist_wheel --dist-dir=dist
# Check the size of the wheel if RUN_WHEEL_CHECK is true
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# Default max size of the wheel is 250MB
ARG VLLM_MAX_SIZE_MB=250
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
python3 check-wheel-size.py dist; \
else \
echo "Skipping wheel size check."; \
fi
RUN python3 check-wheel-size.py dist
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
pip install dist/*.whl --verbose
#################### vLLM installation IMAGE ####################
@ -182,7 +111,7 @@ ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
pip install -r requirements-dev.txt
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@ -199,7 +128,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
pip install accelerate hf_transfer modelscope
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@ -1,53 +1,22 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
FROM ubuntu:22.04
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
COPY ./ ./
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
CMD ["/bin/bash"]

View File

@ -1,5 +1,5 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
FROM $BASE_IMAGE
@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..

View File

@ -1,29 +0,0 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace
# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
CMD ["/bin/bash"]

View File

@ -1,22 +0,0 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
# These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,33 +1,35 @@
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
FROM $BASE_IMAGE
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
RUN echo "Base image is $BASE_IMAGE"
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
# Whether to build triton on rocm
ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"
# whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
@ -38,144 +40,75 @@ RUN apt-get update && apt-get install -y \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
# When launching the container, mount the code directory to /vllm-workspace
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240726 \
torchvision==0.20.0.dev20240726 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
&& python3 -m pip wheel . --wheel-dir=/install
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
mkdir libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
&& python3 setup.py install \
&& cd ..; \
fi
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
WORKDIR /vllm-workspace
COPY . .
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
#RUN python3 -m pip install pynvml # to be removed eventually
RUN python3 -m pip install --upgrade pip numba
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop
ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
CMD ["/bin/bash"]

View File

@ -1,17 +0,0 @@
ARG NIGHTLY_DATE="20240828"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
RUN cd /workspace/vllm && python3 setup.py develop
CMD ["/bin/bash"]

View File

@ -1,22 +0,0 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-xpu.txt
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"]

View File

@ -1,5 +1,4 @@
include LICENSE
include requirements-adag.txt
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt

View File

@ -10,29 +10,32 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
</p>
---
**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
Join us to hear the vLLM's recent update about performance.
Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
@ -45,35 +48,30 @@ vLLM is fast with:
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism and pipeline parallelism support for distributed inference
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support
- Multi-lora support
- Support NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Getting Started
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
```bash
pip install vllm
@ -104,18 +102,14 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Skywork AI
- Trainy
- UC Berkeley
- UC San Diego
- ZhenFund
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

View File

@ -4,13 +4,10 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import List, Optional
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@ -71,13 +68,9 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@ -225,8 +218,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
"v1/completions"
), "OpenAI Completions API URL must end with 'v1/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -265,9 +258,6 @@ async def async_request_openai_completions(
else:
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
@ -276,7 +266,10 @@ async def async_request_openai_completions(
output.ttft = ttft
# Decoding phase
else:
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# do not want to include as inter-token-latency
elif data.get("usage", None) is None:
output.itl.append(timestamp -
most_recent_timestamp)
@ -305,8 +298,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"chat/completions"
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
"v1/chat/completions"
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -391,30 +384,6 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path)
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
trust_remote_code=trust_remote_code)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
@ -423,5 +392,4 @@ ASYNC_REQUEST_FUNCS = {
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
}

View File

@ -10,10 +10,8 @@ import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
@ -21,33 +19,24 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_draft_tensor_parallel_size=\
args.speculative_draft_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
)
llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams(
n=args.n,
@ -61,7 +50,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_inputs: List[PromptInputs] = [{
dummy_inputs: List[PromptStrictInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
@ -106,7 +95,7 @@ def main(args: argparse.Namespace):
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentages = [10, 25, 50, 75, 90]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
@ -124,16 +113,12 @@ def main(args: argparse.Namespace):
if __name__ == '__main__':
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
@ -159,12 +144,6 @@ if __name__ == '__main__':
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
@ -208,10 +187,9 @@ if __name__ == '__main__':
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
@ -221,9 +199,6 @@ if __name__ == '__main__':
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
@ -246,40 +221,5 @@ if __name__ == '__main__':
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)

View File

@ -1,44 +1,7 @@
"""
Benchmark the efficiency of prefix caching.
This script allows you to benchmark the performance of
a model with and without prefix caching using either fixed prompts
or prompts sampled from the ShareGPT dataset.
Fixed example usage:
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100
ShareGPT example usage:
# This command samples 20 prompts with input lengths
# between 128 and 256 tokens from the ShareGPT dataset,
# then replicates each prompt 5 times.
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
"""
import json
import random
import argparse
import time
from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@ -52,83 +15,7 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
min_len, max_len = input_length_range
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if min_len <= prompt_len <= max_len:
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
return [req[0] for req in repeated_requests]
def main(args):
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
input_length_range = tuple(map(int, args.input_length_range.split(':')))
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts"
"from {args.dataset_path}")
filtered_datasets = sample_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
)
else:
prompt_len = len(tokenizer(PROMPT).input_ids)
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts
llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
@ -137,13 +24,10 @@ def main(args):
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("Testing filtered datasets")
prompts = repeat_and_sort_requests(filtered_datasets,
repeat_count=args.repeat_count,
sort=args.sort)
print("------warm up------")
test_prefix(
llm=llm,
@ -160,16 +44,12 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description=
'Benchmark the performance with or without automatic prefix caching.')
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
@ -178,21 +58,5 @@ if __name__ == "__main__":
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
default=1,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
default=100,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
default='128:256',
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
args = parser.parse_args()
main(args)

View File

@ -2,8 +2,8 @@
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \
--disable-log-requests
(TGI backend)
@ -17,7 +17,7 @@ On the client side, run:
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
@ -31,7 +31,7 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@ -39,15 +39,7 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.transformers_utils.tokenizer import get_tokenizer
@dataclass
@ -56,27 +48,14 @@ class BenchmarkMetrics:
total_input: int
total_output: int
request_throughput: float
input_throughput: float
output_throughput: float
total_token_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]
p99_tpot_ms: float
def sample_sharegpt_requests(
@ -87,6 +66,7 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@ -194,31 +174,6 @@ def sample_sonnet_requests(
return sampled_requests
def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))
return input_requests
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
@ -230,7 +185,6 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
@ -242,33 +196,21 @@ def calculate_metrics(
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
actual_output_lens = []
total_input = 0
completed = 0
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
tpots = []
ttfts = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
completed += 1
else:
actual_output_lens.append(0)
@ -283,29 +225,15 @@ def calculate_metrics(
total_input=total_input,
total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
input_throughput=total_input / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
mean_e2el_ms=np.median(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.mean(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
)
return metrics, actual_output_lens
@ -314,7 +242,6 @@ def calculate_metrics(
async def benchmark(
backend: str,
api_url: str,
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
@ -322,12 +249,9 @@ async def benchmark(
use_beam_search: bool,
request_rate: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
request_func = ASYNC_REQUEST_FUNCS.get(backend)
else:
raise ValueError(f"Unknown backend: {backend}")
@ -349,28 +273,12 @@ async def benchmark(
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
tasks = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@ -388,22 +296,7 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
if not disable_tqdm:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
@ -413,8 +306,6 @@ async def benchmark(
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -426,10 +317,23 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
metrics.input_throughput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
metrics.median_ttft_ms))
print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
n=50,
c='-'))
print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("=" * 50)
result = {
"duration": benchmark_duration,
@ -437,8 +341,14 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
@ -446,47 +356,6 @@ async def benchmark(
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
}
def process_one_metric(
# E.g., "ttft"
metric_attribute_name: str,
# E.g., "TTFT"
metric_name: str,
# E.g., "Time to First Token"
metric_header: str,
):
# This function print and add statistics of the specified
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
print("=" * 50)
return result
@ -501,10 +370,8 @@ def main(args: argparse.Namespace):
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}"
else:
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
@ -560,15 +427,6 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
@ -576,7 +434,6 @@ def main(args: argparse.Namespace):
benchmark(
backend=backend,
api_url=api_url,
base_url=base_url,
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
@ -584,16 +441,11 @@ def main(args: argparse.Namespace):
use_beam_search=args.use_beam_search,
request_rate=args.request_rate,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
))
# Save config and results to json
if args.save_result:
result_json: Dict[str, Any] = {}
result_json = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -626,8 +478,6 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile:
@ -635,7 +485,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
@ -668,7 +518,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet", "random"],
choices=["sharegpt", "sonnet"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@ -685,7 +535,7 @@ if __name__ == "__main__":
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
"Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--best-of",
@ -728,27 +578,6 @@ if __name__ == "__main__":
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
type=float,
@ -769,12 +598,6 @@ if __name__ == "__main__":
action="store_true",
help="Specify to disable tqdm progress bar.",
)
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
action="store_true",
@ -795,32 +618,6 @@ if __name__ == "__main__":
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
parser.add_argument(
"--result-filename",
type=str,
default=None,
help="Specify the filename to save benchmark json results."
"If not specified, results will be saved in "
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
args = parser.parse_args()
main(args)

View File

@ -6,16 +6,11 @@ import time
from typing import List, Optional, Tuple
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def sample_requests(
@ -83,13 +78,8 @@ def run_vllm(
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@ -110,16 +100,11 @@ def run_vllm(
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
prompts = []
sampling_params = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
@ -138,93 +123,6 @@ def run_vllm(
return end - start
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
disable_frontend_multiprocessing: bool = False,
) -> float:
from vllm import SamplingParams
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
engine_use_ray=False,
disable_log_requests=True,
)
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
generators = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
@ -320,24 +218,15 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
run_args = [
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
elapsed_time = run_vllm(*run_args)
args.max_num_batched_tokens, args.gpu_memory_utilization,
args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -367,7 +256,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@ -454,22 +343,13 @@ if __name__ == "__main__":
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument(
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
help="Enable block manager v2.")
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching for vLLM backend.")
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
@ -488,49 +368,6 @@ if __name__ == "__main__":
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
"--disable-async-output-proc",
action='store_true',
default=False,
help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,389 +0,0 @@
import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
# helpers
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
if dtype == torch.int8:
return to_int8(a), to_int8(b)
if dtype == torch.float8_e4m3fn:
return to_fp8(a), to_fp8(b)
raise ValueError("unsupported dtype")
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
**kwargs) -> TMeasurement:
min_run_time = 1
globals = {
"args": args,
"kwargs": kwargs,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(*args, **kwargs)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
timers = []
# pytorch impl - bfloat16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16)))
# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
# cutlass impl
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass with azp per-tensor
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj))
# cutlass with azp per-tensor + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, None, bias))
# cutlass with azp per-token
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp))
# cutlass with azp per-token + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp, bias))
return timers
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda")))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16,
use_fast_accum=True))
# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))
return timers
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
n = len(dim_sizes)
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['int8', 'fp8']")
subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument("--dim-start", type=int, required=True)
range_parser.add_argument("--dim-end", type=int, required=True)
range_parser.add_argument("--dim-increment", type=int, required=True)
range_parser.add_argument("--m-constant", type=int, default=None)
range_parser.add_argument("--n-constant", type=int, default=None)
range_parser.add_argument("--k-constant", type=int, default=None)
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument("--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys())
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)

View File

@ -1,43 +0,0 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
}

View File

@ -1,3 +1,4 @@
import argparse
import os
import sys
from typing import Optional
@ -9,7 +10,6 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
from vllm.utils import FlexibleArgumentParser
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
n = int(parts.sum().item())
n = parts.sum().item()
device = torch.device('cuda:0')
@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
def main():
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument("--nbooks",
@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
methods):
# I didn't see visible improvements from increasing these, but feel free :)
@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
print('')
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
nbooks: int, bits: int, method) -> float:
n = int(parts.sum().item())
n = parts.sum().item()
device = torch.device('cuda:0')

Some files were not shown because too many files have changed in this diff Show More