mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
Compare commits
4 Commits
v0.9.2
...
optimize-p
Author | SHA1 | Date | |
---|---|---|---|
d5bf492f16 | |||
8c7bab79f5 | |||
1936d7bab0 | |||
996cf2de5c |
@ -1,53 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
|
||||
# Note that we have 400 MiB quota, please use it wisely.
|
||||
# See https://github.com/pypi/support/issues/3792 .
|
||||
# Please also sync the value with the one in Dockerfile.
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
|
||||
MAX_SIZE_MB = 200
|
||||
|
||||
|
||||
def print_top_10_largest_files(zip_file):
|
||||
"""Print the top 10 largest files in the given zip file."""
|
||||
with zipfile.ZipFile(zip_file, "r") as z:
|
||||
with zipfile.ZipFile(zip_file, 'r') as z:
|
||||
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
|
||||
file_sizes.sort(key=lambda x: x[1], reverse=True)
|
||||
for f, size in file_sizes[:10]:
|
||||
print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
|
||||
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
|
||||
|
||||
|
||||
def check_wheel_size(directory):
|
||||
"""Check the size of .whl files in the given directory."""
|
||||
for root, _, files in os.walk(directory):
|
||||
for file_name in files:
|
||||
if file_name.endswith(".whl"):
|
||||
wheel_path = os.path.join(root, file_name)
|
||||
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
|
||||
if wheel_size_mb > VLLM_MAX_SIZE_MB:
|
||||
for f in files:
|
||||
if f.endswith(".whl"):
|
||||
wheel_path = os.path.join(root, f)
|
||||
wheel_size = os.path.getsize(wheel_path)
|
||||
wheel_size_mb = wheel_size / (1024 * 1024)
|
||||
if wheel_size_mb > MAX_SIZE_MB:
|
||||
print(
|
||||
f"Not allowed: Wheel {wheel_path} is larger "
|
||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||
f"({VLLM_MAX_SIZE_MB} MB)."
|
||||
)
|
||||
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
|
||||
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
|
||||
print_top_10_largest_files(wheel_path)
|
||||
return 1
|
||||
else:
|
||||
print(
|
||||
f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb:.2f} MB)."
|
||||
)
|
||||
print(f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb} MB).")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python check-wheel-size.py <directory>")
|
||||
sys.exit(1)
|
||||
|
||||
directory = sys.argv[1]
|
||||
sys.exit(check_wheel_size(directory))
|
||||
import sys
|
||||
sys.exit(check_wheel_size(sys.argv[1]))
|
||||
|
18
.buildkite/download-images.sh
Normal file
18
.buildkite/download-images.sh
Normal file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
||||
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
|
||||
mkdir -p images
|
||||
cd images
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
|
||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
|
||||
|
||||
cd -
|
@ -1,27 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
template = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<h1>Links for vLLM</h1/>
|
||||
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--wheel", help="The wheel path.", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
filename = os.path.basename(args.wheel)
|
||||
|
||||
with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
|
||||
)
|
@ -1,13 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
||||
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.671
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.664
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
trust_remote_code: True
|
@ -1,12 +0,0 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
||||
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.905
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.905
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.892
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.892
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.752
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.754
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.753
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.753
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.755
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.755
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.753
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.753
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.764
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.764
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.728
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.728
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.758
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.759
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
||||
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.756
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.752
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.419
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.416
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.335
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.323
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.356
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.358
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "mgoin/Minitron-4B-Base-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.231
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.22
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.86
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.86
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.624
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.624
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
||||
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.616
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.632
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.30
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.465
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.578
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.585
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.593
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.588
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.595
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.582
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.792
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.824
|
||||
limit: 250
|
||||
num_fewshot: 5
|
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.54
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.59
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.47
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.64
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.6353
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.637
|
||||
limit: null
|
||||
num_fewshot: null
|
@ -1,6 +0,0 @@
|
||||
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
|
||||
Meta-Llama-3-70B-Instruct.yaml
|
||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||
Qwen2-57B-A14-Instruct.yaml
|
||||
DeepSeek-V2-Lite-Chat.yaml
|
||||
Meta-Llama-3-8B-QQQ.yaml
|
@ -1,6 +0,0 @@
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
@ -1,44 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--config-list-file",
|
||||
action="store",
|
||||
help="Path to the file listing model config YAMLs (one per line)",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tp-size",
|
||||
action="store",
|
||||
default="1",
|
||||
help="Tensor parallel size to use for evaluation",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def config_list_file(pytestconfig, config_dir):
|
||||
rel_path = pytestconfig.getoption("--config-list-file")
|
||||
return config_dir / rel_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tp_size(pytestconfig):
|
||||
return pytestconfig.getoption("--tp-size")
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc):
|
||||
if "config_filename" in metafunc.fixturenames:
|
||||
rel_path = metafunc.config.getoption("--config-list-file")
|
||||
config_list_file = Path(rel_path).resolve()
|
||||
config_dir = config_list_file.parent
|
||||
with open(config_list_file, encoding="utf-8") as f:
|
||||
configs = [
|
||||
config_dir / line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
metafunc.parametrize("config_filename", configs)
|
@ -1,46 +0,0 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our automated nm-test-accuracy workflow"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -b - batch size to run the evaluation at"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -f - number of fewshot samples to use"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:b:l:f:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
b )
|
||||
BATCH_SIZE="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
f )
|
||||
FEWSHOT="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model hf \
|
||||
--model_args "pretrained=$MODEL,parallelize=True" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
@ -1,51 +0,0 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on GSM for vllm.
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our automated nm-test-accuracy workflow"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -b - batch size to run the evaluation at"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -f - number of fewshot samples to use"
|
||||
echo " -t - tensor parallel size to run at"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:b:l:f:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
b )
|
||||
BATCH_SIZE="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
f )
|
||||
FEWSHOT="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
@ -1,55 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
LM eval harness on model to compare vs HF baseline computed offline.
|
||||
Configs are found in configs/$MODEL.yaml
|
||||
|
||||
pytest -s -v test_lm_eval_correctness.py \
|
||||
--config-list-file=configs/models-small.txt \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
RTOL = 0.08
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code}"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
batch_size="auto",
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
ground_truth = metric["value"]
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
assert success
|
@ -1,181 +0,0 @@
|
||||
# vLLM benchmark suite
|
||||
|
||||
## Introduction
|
||||
|
||||
This directory contains two sets of benchmark for vllm.
|
||||
|
||||
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
|
||||
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
||||
|
||||
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||
|
||||
## Performance benchmark quick overview
|
||||
|
||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
|
||||
|
||||
**Benchmarking Duration**: about 1hr.
|
||||
|
||||
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
||||
|
||||
## Nightly benchmark quick overview
|
||||
|
||||
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
|
||||
|
||||
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
|
||||
|
||||
**Benchmarking Duration**: about 3.5hrs.
|
||||
|
||||
## Trigger the benchmark
|
||||
|
||||
Performance benchmark will be triggered when:
|
||||
- A PR being merged into vllm.
|
||||
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
|
||||
|
||||
Manually Trigger the benchmark
|
||||
|
||||
```bash
|
||||
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
```
|
||||
|
||||
Runtime environment variables:
|
||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
|
||||
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
||||
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
||||
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
|
||||
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
||||
|
||||
Nightly benchmark will be triggered when:
|
||||
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
||||
|
||||
## Performance benchmark details
|
||||
|
||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
||||
### Latency test
|
||||
|
||||
Here is an example of one test inside `latency-tests.json`:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
},
|
||||
]
|
||||
```
|
||||
|
||||
In this example:
|
||||
|
||||
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||
|
||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||
|
||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
|
||||
|
||||
### Throughput test
|
||||
|
||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
||||
|
||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||
|
||||
### Serving test
|
||||
|
||||
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
]
|
||||
```
|
||||
|
||||
Inside this example:
|
||||
|
||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||
- The `server-parameters` includes the command line arguments for vLLM server.
|
||||
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
|
||||
|
||||
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
||||
|
||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||
If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||
|
||||
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
|
||||
Here is an example using the script to compare result_a and result_b without detail test name.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
|
||||
|
||||
| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|----------------------------------------|----------------------------------------|----------|
|
||||
| 0 | 142.633982 | 156.526018 | 1.097396 |
|
||||
| 1 | 241.620334 | 294.018783 | 1.216863 |
|
||||
| 2 | 218.298905 | 262.664916 | 1.203235 |
|
||||
| 3 | 242.743860 | 299.816190 | 1.235113 |
|
||||
|
||||
Here is an example using the script to compare result_a and result_b with detail test name.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|
||||
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
|
||||
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
|
||||
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
|
||||
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
|
||||
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
|
||||
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
|
||||
|
||||
## Nightly test details
|
||||
|
||||
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
|
||||
|
||||
### Workflow
|
||||
|
||||
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
|
||||
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
|
||||
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
|
||||
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
|
||||
|
||||
### Nightly tests
|
||||
|
||||
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
|
||||
|
||||
### Docker containers
|
||||
|
||||
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
|
||||
|
||||
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
|
||||
|
||||
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
|
@ -1,184 +0,0 @@
|
||||
steps:
|
||||
- label: "Wait for container to be ready"
|
||||
key: wait-for-container-image
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
containers:
|
||||
- image: badouralix/curl-jq
|
||||
command:
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- label: "Cleanup H100"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: ~
|
||||
command: docker system prune -a --volumes --force
|
||||
|
||||
- label: "A100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: A100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: perf-benchmark
|
||||
containers:
|
||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
- label: "H200"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H200
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: 4,5,6,7
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
#- block: "Run H100 Benchmark"
|
||||
#key: block-h100
|
||||
#depends_on: ~
|
||||
|
||||
- label: "H100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
# Premerge benchmark
|
||||
- label: "A100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: A100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: perf-benchmark
|
||||
containers:
|
||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
- label: "H200"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H200
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: 4,5,6,7
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
#- block: "Run H100 Benchmark"
|
||||
#key: block-h100
|
||||
#depends_on: ~
|
||||
|
||||
- label: "H100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
@ -1,27 +0,0 @@
|
||||
|
||||
## Description
|
||||
|
||||
This file contains the downloading link for benchmarking results.
|
||||
|
||||
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
|
||||
- [benchmarking results](artifact://results.zip)
|
||||
- [benchmarking code](artifact://nightly-benchmarks.zip)
|
||||
|
||||
Please download the visualization scripts in the post
|
||||
|
||||
## Results reproduction
|
||||
|
||||
- Find the docker we use in `benchmarking pipeline`
|
||||
- Deploy the docker, and inside the docker:
|
||||
- Download `nightly-benchmarks.zip`.
|
||||
- In the same folder, run the following code:
|
||||
|
||||
```bash
|
||||
export HF_TOKEN=<your HF token>
|
||||
apt update
|
||||
apt install -y git
|
||||
unzip nightly-benchmarks.zip
|
||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
```
|
||||
|
||||
And the results will be inside `./benchmarks/results`.
|
@ -1,39 +0,0 @@
|
||||
|
||||
# Nightly benchmark
|
||||
|
||||
This benchmark aims to:
|
||||
|
||||
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
||||
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||
|
||||
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||
|
||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
|
||||
## Setup
|
||||
|
||||
- Docker images:
|
||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||
- Hardware
|
||||
- 8x Nvidia A100 GPUs
|
||||
- Workload:
|
||||
- Dataset
|
||||
- ShareGPT dataset
|
||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||
- Models: llama-3 8B, llama-3 70B.
|
||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
## Known issues
|
||||
|
||||
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||
- TGI does not support `ignore-eos` flag.
|
@ -1,196 +0,0 @@
|
||||
common_pod_spec: &common_pod_spec
|
||||
priorityClassName: perf-benchmark
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: hf-cache
|
||||
hostPath:
|
||||
path: /root/.cache/huggingface
|
||||
type: Directory
|
||||
|
||||
common_container_settings: &common_container_settings
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
- name: hf-cache
|
||||
mountPath: /root/.cache/huggingface
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
steps:
|
||||
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||
|
||||
|
||||
|
||||
- label: "A100 vllm step 10"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:v0.6.2
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
- label: "A100 sglang benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: lmsysorg/sglang:v0.3.2-cu121
|
||||
<<: *common_container_settings
|
||||
|
||||
- label: "A100 lmdeploy benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: openmmlab/lmdeploy:v0.6.1-cu12
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
|
||||
- label: "A100 trt llama-8B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama8B"
|
||||
|
||||
|
||||
- label: "A100 trt llama-70B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama70B"
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
|
||||
# - label: "A100 trt benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
# <<: *common_container_settings
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
|
||||
# - label: "A100 tgi benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
|
||||
# <<: *common_container_settings
|
||||
|
||||
- wait
|
||||
|
||||
- label: "Collect the results"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:v0.5.0.post1
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
- block: ":rocket: check the results!"
|
@ -1,64 +0,0 @@
|
||||
|
||||
## Latency tests
|
||||
|
||||
- Input length: 32 tokens.
|
||||
- Output length: 128 tokens.
|
||||
- Batch size: fixed (8).
|
||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||
- CPU Models: llama-3.1 8B.
|
||||
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||
|
||||
{latency_tests_markdown_table}
|
||||
|
||||
## Throughput tests
|
||||
|
||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||
- Output length: the corresponding output length of these 200 prompts.
|
||||
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||
- CPU Models: llama-3.1 8B.
|
||||
- Evaluation metrics: throughput.
|
||||
|
||||
{throughput_tests_markdown_table}
|
||||
|
||||
## Serving tests
|
||||
|
||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||
- Output length: the corresponding output length of these 200 prompts.
|
||||
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
|
||||
- CPU Models: llama-3.1 8B.
|
||||
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
|
||||
|
||||
{serving_tests_markdown_table}
|
||||
|
||||
## Platform Information
|
||||
|
||||
{platform_markdown_table}
|
||||
|
||||
## json version of the benchmarking tables
|
||||
|
||||
This section contains the data of the markdown tables above in JSON format.
|
||||
You can load the benchmarking tables into pandas dataframes as follows:
|
||||
|
||||
```python
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
benchmarking_results_json = """The json string"""
|
||||
benchmarking_results = json.loads(benchmarking_results_json)
|
||||
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
|
||||
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
|
||||
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
|
||||
```
|
||||
|
||||
The json string for all benchmarking tables:
|
||||
|
||||
```json
|
||||
{benchmarking_results_in_json_string}
|
||||
```
|
||||
|
||||
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
@ -1,66 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def compare_data_columns(
|
||||
files, name_column, data_column, drop_column, ignore_test_name=False
|
||||
):
|
||||
print("\ncompare_data_column: " + data_column)
|
||||
frames = []
|
||||
compare_frames = []
|
||||
for file in files:
|
||||
data_df = pd.read_json(file)
|
||||
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
|
||||
if ignore_test_name is False:
|
||||
serving_df = serving_df.rename(columns={name_column: file + "_name"})
|
||||
frames.append(serving_df[file + "_name"])
|
||||
serving_df = serving_df.rename(columns={data_column: file})
|
||||
frames.append(serving_df[file])
|
||||
compare_frames.append(serving_df[file])
|
||||
if len(compare_frames) >= 2:
|
||||
# Compare numbers among two files
|
||||
ratio_df = compare_frames[1] / compare_frames[0]
|
||||
frames.append(ratio_df)
|
||||
compare_frames.pop(1)
|
||||
|
||||
concat_df = pd.concat(frames, axis=1)
|
||||
return concat_df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-f", "--file", action="append", type=str, help="input file name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore_test_name", action="store_true", help="ignore_test_name or not"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
files = args.file
|
||||
print("comparing : " + ", ".join(files))
|
||||
|
||||
drop_column = "P99"
|
||||
name_column = "Test name"
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"Median TTFT /n",
|
||||
"Median TPOT /n",
|
||||
]
|
||||
ignore_test_name = args.ignore_test_name
|
||||
with open("perf_comparison.html", "w") as text_file:
|
||||
for i in range(len(data_cols_to_compare)):
|
||||
output_df = compare_data_columns(
|
||||
files,
|
||||
name_column,
|
||||
data_cols_to_compare[i],
|
||||
drop_column,
|
||||
ignore_test_name=ignore_test_name,
|
||||
)
|
||||
print(output_df)
|
||||
html = output_df.to_html()
|
||||
text_file.write(html_msgs_for_data_cols[i])
|
||||
text_file.write(html)
|
@ -1,268 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import psutil
|
||||
from tabulate import tabulate
|
||||
|
||||
results_folder = Path("results/")
|
||||
|
||||
# latency results and the keys that will be printed into markdown
|
||||
latency_results = []
|
||||
latency_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"gpu_type": "GPU",
|
||||
"avg_latency": "Mean latency (ms)",
|
||||
# "P10": "P10 (s)",
|
||||
# "P25": "P25 (s)",
|
||||
"P50": "Median latency (ms)",
|
||||
# "P75": "P75 (s)",
|
||||
# "P90": "P90 (s)",
|
||||
"P99": "P99 latency (ms)",
|
||||
}
|
||||
|
||||
# throughput tests and the keys that will be printed into markdown
|
||||
throughput_results = []
|
||||
throughput_results_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"gpu_type": "GPU",
|
||||
"num_requests": "# of req.",
|
||||
"total_num_tokens": "Total # of tokens",
|
||||
"elapsed_time": "Elapsed time (s)",
|
||||
"requests_per_second": "Tput (req/s)",
|
||||
"tokens_per_second": "Tput (tok/s)",
|
||||
}
|
||||
|
||||
# serving results and the keys that will be printed into markdown
|
||||
serving_results = []
|
||||
serving_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"gpu_type": "GPU",
|
||||
"completed": "# of req.",
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"median_tpot_ms": "Median",
|
||||
"p99_tpot_ms": "P99",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"p99_itl_ms": "P99 ITL (ms)",
|
||||
}
|
||||
|
||||
|
||||
def read_markdown(file):
|
||||
if os.path.exists(file):
|
||||
with open(file) as f:
|
||||
return f.read() + "\n"
|
||||
else:
|
||||
return f"{file} not found.\n"
|
||||
|
||||
|
||||
def results_to_json(latency, throughput, serving):
|
||||
return json.dumps(
|
||||
{
|
||||
"latency": latency.to_dict(),
|
||||
"throughput": throughput.to_dict(),
|
||||
"serving": serving.to_dict(),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def get_size_with_unit(bytes, suffix="B"):
|
||||
"""
|
||||
Scale bytes to its proper format
|
||||
e.g:
|
||||
1253656 => '1.20MB'
|
||||
1253656678 => '1.17GB'
|
||||
"""
|
||||
factor = 1024
|
||||
for unit in ["", "K", "M", "G", "T", "P"]:
|
||||
if bytes < factor:
|
||||
return f"{bytes:.2f}{unit}{suffix}"
|
||||
bytes /= factor
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
if "serving" in str(test_file):
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# add the result to raw_result
|
||||
serving_results.append(raw_result)
|
||||
continue
|
||||
|
||||
elif "latency" in f.name:
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# get different percentiles
|
||||
for perc in [10, 25, 50, 75, 90, 99]:
|
||||
# Multiply 1000 to convert the time unit from s to ms
|
||||
raw_result.update(
|
||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
||||
)
|
||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||
|
||||
# add the result to raw_result
|
||||
latency_results.append(raw_result)
|
||||
continue
|
||||
|
||||
elif "throughput" in f.name:
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# add the result to raw_result
|
||||
throughput_results.append(raw_result)
|
||||
continue
|
||||
|
||||
print(f"Skipping {test_file}")
|
||||
|
||||
latency_results = pd.DataFrame.from_dict(latency_results)
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||
|
||||
svmem = psutil.virtual_memory()
|
||||
platform_data = {
|
||||
"Physical cores": [psutil.cpu_count(logical=False)],
|
||||
"Total cores": [psutil.cpu_count(logical=True)],
|
||||
"Total Memory": [get_size_with_unit(svmem.total)],
|
||||
}
|
||||
|
||||
if util.find_spec("numa") is not None:
|
||||
from numa import info
|
||||
|
||||
platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
|
||||
|
||||
if util.find_spec("cpuinfo") is not None:
|
||||
from cpuinfo import get_cpu_info
|
||||
|
||||
platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
|
||||
|
||||
platform_results = pd.DataFrame.from_dict(
|
||||
platform_data, orient="index", columns=["Platform Info"]
|
||||
)
|
||||
|
||||
raw_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
|
||||
# remapping the key, for visualization purpose
|
||||
if not latency_results.empty:
|
||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
||||
columns=latency_column_mapping
|
||||
)
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
if not throughput_results.empty:
|
||||
throughput_results = throughput_results[
|
||||
list(throughput_results_column_mapping.keys())
|
||||
].rename(columns=throughput_results_column_mapping)
|
||||
|
||||
processed_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
|
||||
for df in [latency_results, serving_results, throughput_results]:
|
||||
if df.empty:
|
||||
continue
|
||||
|
||||
# Sort all dataframes by their respective "Test name" columns
|
||||
df.sort_values(by="Test name", inplace=True)
|
||||
|
||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||
# we want to turn it into "8xGPUTYPE"
|
||||
df["GPU"] = df["GPU"].apply(
|
||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
||||
)
|
||||
|
||||
# get markdown tables
|
||||
latency_md_table = tabulate(
|
||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
serving_md_table = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
throughput_md_table = tabulate(
|
||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
platform_md_table = tabulate(
|
||||
platform_results, headers="keys", tablefmt="pipe", showindex=True
|
||||
)
|
||||
|
||||
# document the result
|
||||
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/nightly-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
)
|
||||
results = results.format(
|
||||
latency_tests_markdown_table=latency_md_table,
|
||||
throughput_tests_markdown_table=throughput_md_table,
|
||||
serving_tests_markdown_table=serving_md_table,
|
||||
platform_markdown_table=platform_md_table,
|
||||
benchmarking_results_in_json_string=processed_results_json,
|
||||
)
|
||||
f.write(results)
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||
results = (
|
||||
latency_results.to_dict(orient="records")
|
||||
+ throughput_results.to_dict(orient="records")
|
||||
+ serving_results.to_dict(orient="records")
|
||||
)
|
||||
f.write(json.dumps(results))
|
@ -1,26 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
def main(model, cachedir):
|
||||
# Load the tokenizer and save it to the specified directory
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
tokenizer.save_pretrained(cachedir)
|
||||
print(f"Tokenizer saved to {cachedir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download and save Hugging Face tokenizer"
|
||||
)
|
||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
||||
parser.add_argument(
|
||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.model, args.cachedir)
|
@ -1,97 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Parse command line arguments for summary-nightly-results script."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--results-folder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The folder where the results are stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--description", type=str, required=True, help="Description of the results."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df["Test name"].str.contains(model)
|
||||
target = target & df["Engine"].str.contains(method)
|
||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.0)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
return np.array(means)
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
std = get_perf(df, method, model, "Std " + metric + " (ms)")
|
||||
if std.mean() == 0:
|
||||
std = None
|
||||
success = get_perf(df, method, model, "Successful req.")
|
||||
if std is not None:
|
||||
std = std / np.sqrt(success)
|
||||
std = std.tolist()
|
||||
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)"
|
||||
)
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
return mean, std
|
||||
|
||||
|
||||
def main(args):
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file) as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
main(args)
|
@ -1,9 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from lmdeploy.serve.openai.api_client import APIClient
|
||||
|
||||
api_client = APIClient("http://localhost:8000")
|
||||
model_name = api_client.available_models[0]
|
||||
|
||||
print(model_name)
|
@ -1,228 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Currently FP8 benchmark is NOT enabled.
|
||||
|
||||
set -x
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
launch_trt_server() {
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
|
||||
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
# create model caching directory
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
# clone tensorrt backend
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout "$trt_llm_version"
|
||||
git submodule update --init --recursive
|
||||
|
||||
# build trtllm engine
|
||||
cd /tensorrtllm_backend
|
||||
cd "./tensorrt_llm/examples/${model_type}"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir "${model_path}" \
|
||||
--dtype "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--output_dir "${trt_model_path}"
|
||||
trtllm-build \
|
||||
--checkpoint_dir "${trt_model_path}" \
|
||||
--use_fused_mlp \
|
||||
--reduce_fusion disable \
|
||||
--workers 8 \
|
||||
--gpt_attention_plugin "${model_dtype}" \
|
||||
--gemm_plugin "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--max_batch_size "${max_batch_size}" \
|
||||
--max_input_len "${max_input_len}" \
|
||||
--max_seq_len "${max_seq_len}" \
|
||||
--max_num_tokens "${max_num_tokens}" \
|
||||
--output_dir "${trt_engine_path}"
|
||||
|
||||
# handle triton protobuf files and launch triton server
|
||||
cd /tensorrtllm_backend
|
||||
mkdir triton_model_repo
|
||||
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
||||
cd triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
||||
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
|
||||
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
|
||||
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size="${model_tp_size}" \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||
|
||||
}
|
||||
|
||||
launch_tgi_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
}
|
||||
|
||||
launch_lmdeploy_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
}
|
||||
|
||||
launch_sglang_server() {
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
launch_vllm_server() {
|
||||
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
|
||||
launch_trt_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
|
||||
launch_tgi_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
launch_lmdeploy_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
|
||||
launch_sglang_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
|
||||
launch_vllm_server
|
||||
fi
|
||||
}
|
||||
|
||||
main
|
@ -1,78 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which zip) || (apt-get install -y zip)
|
||||
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip plotting the results."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# initial annotation
|
||||
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
|
||||
# download results
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
mkdir -p results/
|
||||
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
||||
ls
|
||||
ls results/
|
||||
|
||||
# upload benchmark results
|
||||
zip -r results.zip results/
|
||||
/workspace/buildkite-agent artifact upload "results.zip"
|
||||
|
||||
# upload benchmarking scripts
|
||||
cd "$VLLM_SOURCE_CODE_LOC/"
|
||||
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
||||
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
# upload benchmarking pipeline
|
||||
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
||||
|
||||
|
||||
|
||||
# The figures should be generated by a separate process outside the CI/CD pipeline
|
||||
|
||||
# # generate figures
|
||||
# python3 -m pip install tabulate pandas matplotlib
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
|
||||
# --description $description \
|
||||
# --results-folder results/
|
||||
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sharegpt
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_2048_128
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_128_2048
|
||||
|
||||
# # upload results and figures
|
||||
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
}
|
||||
|
||||
main "$@"
|
@ -1,462 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
get_current_llm_serving_engine() {
|
||||
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "Container: lmdeploy"
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "Container: tgi"
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
return
|
||||
fi
|
||||
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "Container: tensorrt-llm"
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /sgl-workspace ]; then
|
||||
echo "Container: sglang"
|
||||
export CURRENT_LLM_SERVING_ENGINE=sglang
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "Container: vllm"
|
||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
|
||||
return
|
||||
fi
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill -f python
|
||||
pkill -f python3
|
||||
pkill -f tritonserver
|
||||
pkill -f pt_main_thread
|
||||
pkill -f text-generation
|
||||
pkill -f lmdeploy
|
||||
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
ensure_installed() {
|
||||
# Ensure that the given command is installed by apt-get
|
||||
local cmd=$1
|
||||
if ! which "$cmd" >/dev/null; then
|
||||
apt-get update && apt-get install -y "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
# this is required for lmdeploy.
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
|
||||
|
||||
# change model name for lmdeploy (it will not follow standard hf name)
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ $backend = "trt" ]]; then
|
||||
backend="tensorrt-llm"
|
||||
fi
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
|
||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
elif [[ "$dataset_name" = "sonnet" ]]; then
|
||||
|
||||
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
|
||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--sonnet-input-len $sonnet_input_len \
|
||||
--sonnet-output-len $sonnet_output_len \
|
||||
--sonnet-prefix-len $sonnet_prefix_len \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
else
|
||||
|
||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command="None"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
}
|
||||
|
||||
run_genai_perf_tests() {
|
||||
# run genai-perf tests
|
||||
|
||||
# $1: a json file specifying genai-perf test cases
|
||||
local genai_perf_test_file
|
||||
genai_perf_test_file=$1
|
||||
|
||||
# Iterate over genai-perf tests
|
||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps=$num_prompts
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
#TODO: add output dir.
|
||||
client_command="genai-perf profile \
|
||||
-m $model \
|
||||
--service-kind openai \
|
||||
--backend vllm \
|
||||
--endpoint-type chat \
|
||||
--streaming \
|
||||
--url localhost:$port \
|
||||
--request-rate $qps \
|
||||
--num-prompts $num_prompts \
|
||||
"
|
||||
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
#TODO: process/record outputs
|
||||
done
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
}
|
||||
|
||||
prepare_dataset() {
|
||||
|
||||
# download sharegpt dataset
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
echo "" > sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
# check if the environment variable is successfully injected from yaml
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
get_current_llm_serving_engine
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
pip install -r requirements/dev.txt
|
||||
which genai-perf
|
||||
|
||||
# check storage
|
||||
df -h
|
||||
|
||||
ensure_installed wget
|
||||
ensure_installed curl
|
||||
ensure_installed jq
|
||||
# genai-perf dependency
|
||||
ensure_installed libb64-0d
|
||||
|
||||
prepare_dataset
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
|
||||
# run the test
|
||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||
|
||||
# run genai-perf tests
|
||||
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
|
||||
mv artifacts/ $RESULTS_FOLDER/
|
||||
|
||||
# upload benchmark results to buildkite
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
@ -1,474 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script should be run inside the CI process
|
||||
# This script assumes that we are already inside the vllm/ directory
|
||||
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||
|
||||
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
|
||||
# and we still want to see other benchmarking results even when mixtral crashes.
|
||||
set -x
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||
fi
|
||||
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
||||
fi
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_cpus() {
|
||||
# check the number of CPUs and NUMA Node and GPU type.
|
||||
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
|
||||
if [[ $numa_count -gt 0 ]]; then
|
||||
echo "NUMA found."
|
||||
echo $numa_count
|
||||
else
|
||||
echo "Need at least 1 NUMA to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type="cpu"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_sharegpt_downloaded() {
|
||||
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
if [ ! -f "$FILE" ]; then
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
|
||||
else
|
||||
echo "$FILE already exists."
|
||||
fi
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
json2envs() {
|
||||
# transforms the JSON string to environment variables.
|
||||
# example:
|
||||
# input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
|
||||
# output: VLLM_CPU_KVCACHE_SPACE=5
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map((.key ) + "=" + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -X POST localhost:8000/v1/completions; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
kill_processes_launched_by_current_bash() {
|
||||
# Kill all python processes launched from current bash script
|
||||
current_shell_pid=$$
|
||||
processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
|
||||
if [ -n "$processes" ]; then
|
||||
echo "Killing the following processes matching '$1':"
|
||||
echo "$processes"
|
||||
echo "$processes" | xargs kill -9
|
||||
else
|
||||
echo "No processes found matching '$1'."
|
||||
fi
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
|
||||
ps -aux
|
||||
lsof -t -i:8000 | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
if command -v nvidia-smi; then
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
elif command -v amd-smi; then
|
||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
fi
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
}
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
|
||||
if command -v buildkite-agent >/dev/null 2>&1; then
|
||||
BUILDKITE_AGENT_COMMAND="buildkite-agent"
|
||||
elif [ -f /workspace/buildkite-agent ]; then
|
||||
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
|
||||
else
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Use the determined command to annotate and upload artifacts
|
||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
run_latency_tests() {
|
||||
# run latency tests using `benchmark_latency.py`
|
||||
# $1: a json file specifying latency test cases
|
||||
|
||||
local latency_test_file
|
||||
latency_test_file=$1
|
||||
|
||||
# Iterate over latency tests
|
||||
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
||||
echo "In latency-test.json, test_name must start with \"latency_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
latency_params=$(echo "$params" | jq -r '.parameters')
|
||||
latency_args=$(json2args "$latency_params")
|
||||
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
latency_envs=$(json2envs "$latency_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
latency_command=" $latency_envs python3 benchmark_latency.py \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$latency_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "Latency command: $latency_command"
|
||||
|
||||
# recoding benchmarking command ang GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg latency "$latency_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
latency_command: $latency,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$latency_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
run_throughput_tests() {
|
||||
# run throughput tests using `benchmark_throughput.py`
|
||||
# $1: a json file specifying throughput test cases
|
||||
|
||||
local throughput_test_file
|
||||
throughput_test_file=$1
|
||||
|
||||
# Iterate over throughput tests
|
||||
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
||||
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
throughput_params=$(echo "$params" | jq -r '.parameters')
|
||||
throughput_args=$(json2args "$throughput_params")
|
||||
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
throughput_envs=$(json2envs "$throughput_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$throughput_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "Throughput command: $throughput_command"
|
||||
# recoding benchmarking command ang GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg command "$throughput_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
throughput_command: $command,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$throughput_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||
echo "In serving-test.json, test_name must start with \"serving_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.server_parameters')
|
||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
server_envs=$(json2envs "$server_envs")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough resources to run the test
|
||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
# check if server model and client model is aligned
|
||||
server_model=$(echo "$server_params" | jq -r '.model')
|
||||
client_model=$(echo "$client_params" | jq -r '.model')
|
||||
if [[ $server_model != "$client_model" ]]; then
|
||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
server_command="$server_envs python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
# support remote vllm server
|
||||
client_remote_args=""
|
||||
if [[ -z "${REMOTE_HOST}" ]]; then
|
||||
bash -c "$server_command" &
|
||||
server_pid=$!
|
||||
# wait until the server is alive
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "vLLM server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "vLLM failed to start within the timeout period."
|
||||
fi
|
||||
else
|
||||
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
||||
if [[ ${REMOTE_PORT} ]]; then
|
||||
client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
|
||||
else
|
||||
client_remote_args=" --host=$REMOTE_HOST "
|
||||
fi
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args $client_remote_args "
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
bash -c "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill -9 $server_pid
|
||||
kill_gpu_processes
|
||||
done
|
||||
}
|
||||
|
||||
main() {
|
||||
local ARCH
|
||||
ARCH=''
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
check_cpus
|
||||
ARCH='-cpu'
|
||||
else
|
||||
check_gpus
|
||||
fi
|
||||
check_hf_token
|
||||
|
||||
# Set to v1 to run v1 benchmark
|
||||
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
|
||||
export VLLM_USE_V1=1
|
||||
fi
|
||||
|
||||
# dependencies
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||
|
||||
# get the current IP address, required by benchmark_serving.py
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||
export VLLM_LOGGING_LEVEL="WARNING"
|
||||
|
||||
# prepare for benchmarking
|
||||
cd benchmarks || exit 1
|
||||
ensure_sharegpt_downloaded
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
# benchmarking
|
||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
||||
|
||||
# postprocess benchmarking results
|
||||
pip install tabulate pandas
|
||||
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
|
||||
|
||||
upload_to_buildkite
|
||||
}
|
||||
|
||||
main "$@"
|
@ -1,82 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
results_folder = Path("results/")
|
||||
|
||||
# serving results and the keys that will be printed into markdown
|
||||
serving_results = []
|
||||
serving_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"gpu_type": "GPU",
|
||||
"completed": "Successful req.",
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"std_ttft_ms": "Std TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"std_itl_ms": "Std ITL (ms)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"std_tpot_ms": "Std TPOT (ms)",
|
||||
"median_tpot_ms": "Median TPOT (ms)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
"engine": "Engine",
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# add the result to raw_result
|
||||
serving_results.append(raw_result)
|
||||
continue
|
||||
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
|
||||
serving_md_table_with_headers = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
# remove the first line of header
|
||||
serving_md_table_lines = serving_md_table_with_headers.split("\n")
|
||||
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
|
||||
|
||||
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||
|
||||
# document benchmarking results in markdown
|
||||
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
|
||||
# document results with header.
|
||||
# for those who wants to reproduce our benchmark.
|
||||
f.write(serving_md_table_with_headers)
|
||||
f.write("\n")
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||
results = serving_results.to_dict(orient="records")
|
||||
f.write(json.dumps(results))
|
@ -1,23 +0,0 @@
|
||||
#!/bin/sh
|
||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
|
||||
else
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
|
||||
fi
|
||||
|
||||
TIMEOUT_SECONDS=10
|
||||
|
||||
retries=0
|
||||
while [ $retries -lt 1000 ]; do
|
||||
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Waiting for image to be available..."
|
||||
|
||||
retries=$((retries + 1))
|
||||
sleep 5
|
||||
done
|
||||
|
||||
exit 1
|
@ -1,23 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1_genai_perf",
|
||||
"qps_list": [4,8,16,32],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"port": 8000,
|
||||
"num_prompts": 500,
|
||||
"reuse_server": false
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"genai_perf_input_parameters": {
|
||||
}
|
||||
}
|
||||
]
|
@ -1,30 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama8B_tp4",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
}
|
||||
]
|
@ -1,32 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama70B_tp4",
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"num-iters-warmup": 5,
|
||||
"num-iters": 15
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_mixtral8x7B_tp2",
|
||||
"parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"load_format": "dummy",
|
||||
"num-iters-warmup": 5,
|
||||
"num-iters": 15
|
||||
}
|
||||
}
|
||||
]
|
@ -1,323 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama8B_tp1_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama8B_tp1_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
}
|
||||
]
|
@ -1,158 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 100,
|
||||
"num_prompts": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 6,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"enforce_eager": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 100,
|
||||
"num_prompts": 100
|
||||
}
|
||||
}
|
||||
]
|
@ -1,81 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama70B_tp4_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
||||
"qps_list": [2],
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"disable_log_requests": "",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
"draft_tensor_parallel_size": 1
|
||||
}
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
}
|
||||
]
|
@ -1,32 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp4",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
}
|
||||
]
|
@ -1,35 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp1",
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_llama70B_tp4",
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_mixtral8x7B_tp2",
|
||||
"parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200,
|
||||
"backend": "vllm"
|
||||
}
|
||||
}
|
||||
]
|
@ -1,46 +0,0 @@
|
||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||
# following differences:
|
||||
# - ruff line length is overridden to 88
|
||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
"vllm/version.py" = ["F401"]
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
"E",
|
||||
# Pyflakes
|
||||
"F",
|
||||
# pyupgrade
|
||||
"UP",
|
||||
# flake8-bugbear
|
||||
"B",
|
||||
# flake8-simplify
|
||||
"SIM",
|
||||
# isort
|
||||
"I",
|
||||
# flake8-logging-format
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
# f-string format
|
||||
"UP032",
|
||||
# Can remove once 3.10+ is the minimum Python version
|
||||
"UP007",
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = true
|
@ -1,124 +0,0 @@
|
||||
steps:
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
id: build-wheel-cuda-12-6
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
|
||||
# However, this block can be uncommented to save some compute hours.
|
||||
# - block: "Build CUDA 11.8 wheel"
|
||||
# key: block-build-cu118-wheel
|
||||
|
||||
- label: "Build wheel - CUDA 11.8"
|
||||
# depends_on: block-build-cu118-wheel
|
||||
id: build-wheel-cuda-11-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image"
|
||||
depends_on: ~
|
||||
key: block-release-image-build
|
||||
|
||||
- label: "Build release image"
|
||||
depends_on: block-release-image-build
|
||||
id: build-release-image
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- build-release-image
|
||||
- build-wheel-cuda-12-8
|
||||
- build-wheel-cuda-12-6
|
||||
- build-wheel-cuda-11-8
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Build and publish TPU release image"
|
||||
depends_on: ~
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: tpu_queue_postmerge
|
||||
commands:
|
||||
- "yes | docker system prune -a"
|
||||
- "git fetch --all"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
||||
- "docker push vllm/vllm-tpu:nightly"
|
||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build Neuron release image"
|
||||
key: block-neuron-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish Neuron release image"
|
||||
depends_on: block-neuron-release-image-build
|
||||
agents:
|
||||
queue: neuron-postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
73
.buildkite/run-amd-test.sh
Normal file
73
.buildkite/run-amd-test.sh
Normal file
@ -0,0 +1,73 @@
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -ex
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes
|
||||
docker volume prune -f
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Building container"
|
||||
sha=$(git rev-parse --short HEAD)
|
||||
image_name=rocm_${sha}
|
||||
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
|
||||
docker build \
|
||||
-t ${image_name} \
|
||||
-f Dockerfile.rocm \
|
||||
--progress plain \
|
||||
.
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
--name ${container_name} \
|
||||
${image_name} \
|
||||
/bin/bash -c "${@}"
|
||||
|
@ -1,12 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
# cd 2 levels into the working directory
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
||||
# cd into parent directory of this file
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
||||
@ -52,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
|
||||
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
||||
echo "" >> benchmark_results.md
|
||||
echo '```' >> benchmark_results.md
|
||||
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
|
||||
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
|
||||
echo '```' >> benchmark_results.md
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /usr/bin/buildkite-agent ]; then
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# upload the results to buildkite
|
||||
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
||||
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
||||
|
||||
# exit with the exit code of the benchmarks
|
||||
if [ $bench_latency_exit_code -ne 0 ]; then
|
||||
@ -77,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
|
||||
fi
|
||||
|
||||
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
buildkite-agent artifact upload "*.json"
|
||||
/workspace/buildkite-agent artifact upload "*.json"
|
24
.buildkite/run-cpu-test.sh
Normal file
24
.buildkite/run-cpu-test.sh
Normal file
@ -0,0 +1,24 @@
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t cpu-test -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image
|
||||
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "cd tests;
|
||||
pip install pytest Pillow protobuf
|
||||
bash ../.buildkite/download-images.sh
|
||||
cd ../
|
||||
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
|
51
.buildkite/run-neuron-test.sh
Normal file
51
.buildkite/run-neuron-test.sh
Normal file
@ -0,0 +1,51 @@
|
||||
# This script build the Neuron docker image and run the API server inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||
|
||||
# prune old image and containers to save disk space, and only once a day
|
||||
# by using a timestamp file in tmp.
|
||||
if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
docker system prune -f
|
||||
echo $current_time > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t neuron -f Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f neuron || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image
|
||||
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
|
||||
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
|
||||
|
||||
# Wait for the server to start
|
||||
wait_for_server_to_start() {
|
||||
timeout=300
|
||||
counter=0
|
||||
|
||||
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
|
||||
sleep 1
|
||||
counter=$((counter + 1))
|
||||
if [ $counter -ge $timeout ]; then
|
||||
echo "Timeout after $timeout seconds"
|
||||
break
|
||||
fi
|
||||
done
|
||||
}
|
||||
wait_for_server_to_start
|
||||
|
||||
# Test a simple prompt
|
||||
curl -X POST -H "Content-Type: application/json" \
|
||||
localhost:8000/generate \
|
||||
-d '{"prompt": "San Francisco is a"}'
|
@ -1,31 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get release version and strip leading 'v' if present
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
|
||||
|
||||
if [ -z "$RELEASE_VERSION" ]; then
|
||||
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel:
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai:latest
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
@ -1,17 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Usage: ./ci_clean_log.sh ci.log
|
||||
# This script strips timestamps and color codes from CI log files.
|
||||
|
||||
# Check if argument is given
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Usage: $0 ci.log"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
INPUT_FILE="$1"
|
||||
|
||||
# Strip timestamps
|
||||
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
|
||||
|
||||
# Strip colorization
|
||||
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
@ -1,244 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
|
||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels/core"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/attention"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_aqlm.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/moe"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/openai tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
fi
|
||||
done
|
||||
else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
|
@ -1,49 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
if [[ -n "$container_id" ]]; then
|
||||
podman stop --all -t0
|
||||
podman rm -f "$container_id" || true
|
||||
fi
|
||||
podman system prune -f
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
||||
|
||||
# Run the image
|
||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
||||
|
||||
function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
export container_id
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c cpu_tests
|
||||
|
@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
@ -1,102 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation -m cpu_model
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
VLLM_USE_V1=0 pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online serving
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
@ -1,30 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the GH200 docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
|
||||
python3 use_existing_torch.py
|
||||
|
||||
# Try building the docker image
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t gh200-test \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg RUN_WHEEL_CHECK=false \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f gh200-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
@ -1,58 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -exuo pipefail
|
||||
|
||||
# Try building the docker image
|
||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
||||
FROM 1.22-413-pt2.7.1:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
RUN pip install -v -r requirements/hpu.txt
|
||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
ENV no_proxy=localhost,127.0.0.1
|
||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
EOF
|
||||
|
||||
# Setup cleanup
|
||||
# certain versions of HPU software stack have a bug that can
|
||||
# override the exit code of the script, so we need to use
|
||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||
remove_docker_containers
|
||||
|
||||
echo "Running HPU plugin v1 test"
|
||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
||||
-e HABANA_VISIBLE_DEVICES=all \
|
||||
hpu-plugin-v1-test-env \
|
||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
||||
|
||||
EXITCODE=$?
|
||||
if [ $EXITCODE -eq 0 ]; then
|
||||
echo "Test with basic model passed"
|
||||
else
|
||||
echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
|
||||
fi
|
||||
|
||||
# The trap will handle the container removal and final exit.
|
@ -1,64 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the Neuron docker image and run the API server inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
set -v
|
||||
|
||||
image_name="neuron/vllm-ci"
|
||||
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
||||
|
||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
||||
|
||||
# prune old image and containers to save disk space, and only once a day
|
||||
# by using a timestamp file in tmp.
|
||||
if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune -f
|
||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
docker image rm -f "${image_name}" || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Run the image
|
||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
${image_name} \
|
||||
/bin/bash -c "
|
||||
set -e; # Exit on first error
|
||||
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||
echo \"Running test file: \$f\";
|
||||
python3 -m pytest \$f -v --capture=tee-sys;
|
||||
done
|
||||
"
|
@ -1,187 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
@ -1,32 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || true;
|
||||
docker image rm -f "${image_name}" || true;
|
||||
docker system prune -f || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Run the image and test offline inference/tensor parallel
|
||||
docker run \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
--entrypoint="" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
sh -c '
|
||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
||||
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
'
|
@ -1,18 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Usage: ./rerun_test.sh path/to/test.py::test_name
|
||||
|
||||
# Check if argument is given
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Usage: $0 path/to/test.py::test_name"
|
||||
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
TEST=$1
|
||||
COUNT=1
|
||||
|
||||
while pytest -sv "$TEST"; do
|
||||
COUNT=$((COUNT + 1))
|
||||
echo "RUN NUMBER ${COUNT}"
|
||||
done
|
@ -1,108 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WORKING_DIR=$1
|
||||
NUM_NODES=$2
|
||||
NUM_GPUS=$3
|
||||
DOCKER_IMAGE=$4
|
||||
|
||||
shift 4
|
||||
COMMANDS=("$@")
|
||||
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
|
||||
echo "The number of commands must be equal to the number of nodes."
|
||||
echo "Number of nodes: $NUM_NODES"
|
||||
echo "Number of commands: ${#COMMANDS[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "List of commands"
|
||||
for command in "${COMMANDS[@]}"; do
|
||||
echo "$command"
|
||||
done
|
||||
|
||||
start_network() {
|
||||
docker network create --subnet=192.168.10.0/24 docker-net
|
||||
}
|
||||
|
||||
start_nodes() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
|
||||
# start the container in detached mode
|
||||
# things to note:
|
||||
# 1. --shm-size=10.24gb is required. don't use --ipc=host
|
||||
# 2. pass HF_TOKEN to the container
|
||||
# 3. map the huggingface cache directory to the container
|
||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||
# starting from 192.168.10.11)
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||
/bin/bash -c "tail -f /dev/null"
|
||||
|
||||
# organize containers into a ray cluster
|
||||
if [ "$node" -eq 0 ]; then
|
||||
# start the ray head node
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
|
||||
# wait for the head node to be ready
|
||||
sleep 10
|
||||
else
|
||||
# start the ray worker nodes, and connect them to the head node
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
fi
|
||||
done
|
||||
|
||||
# wait for the cluster to be ready
|
||||
sleep 10
|
||||
|
||||
# print the cluster status
|
||||
docker exec node0 /bin/bash -c "ray status"
|
||||
}
|
||||
|
||||
run_nodes() {
|
||||
# important: iterate in reverse order to start the head node last
|
||||
# we start the worker nodes first, in detached mode, and then start the head node
|
||||
# in the foreground, so that the output of the head node is visible in the buildkite logs
|
||||
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||
if [ "$node" -ne 0 ]; then
|
||||
docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
else
|
||||
docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
cleanup() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
docker stop "node$node"
|
||||
done
|
||||
docker network rm docker-net
|
||||
}
|
||||
trap cleanup EXIT
|
||||
start_network
|
||||
start_nodes
|
||||
run_nodes
|
||||
|
@ -1,24 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
@ -1,14 +0,0 @@
|
||||
# Environment config
|
||||
TEST_NAME=llama8b
|
||||
CONTAINER_NAME=vllm-tpu
|
||||
|
||||
# vllm config
|
||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||
MAX_NUM_SEQS=256
|
||||
MAX_NUM_BATCHED_TOKENS=1024
|
||||
TENSOR_PARALLEL_SIZE=1
|
||||
MAX_MODEL_LEN=2048
|
||||
DOWNLOAD_DIR=/mnt/disks/persist
|
||||
EXPECTED_THROUGHPUT=8.0
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=128
|
@ -1,102 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ ! -f "$1" ]; then
|
||||
echo "Error: The env file '$1' does not exist."
|
||||
exit 1 # Exit the script with a non-zero status to indicate an error
|
||||
fi
|
||||
|
||||
ENV_FILE=$1
|
||||
|
||||
# For testing on local vm, use `set -a` to export all variables
|
||||
source /etc/environment
|
||||
source $ENV_FILE
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
docker rm -f $CONTAINER_NAME || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build docker image.
|
||||
# TODO: build the image outside the script and share the image with other
|
||||
# tpu test if building time is too long.
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg GIT_REPO_CHECK=0 \
|
||||
--tag vllm/vllm-tpu-bm \
|
||||
--progress plain -f docker/Dockerfile.tpu .
|
||||
|
||||
LOG_ROOT=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $LOG_ROOT"
|
||||
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "Error: HF_TOKEN is not set or is empty."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Make sure mounted disk or dir exists
|
||||
if [ ! -d "$DOWNLOAD_DIR" ]; then
|
||||
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Run model $MODEL"
|
||||
echo
|
||||
|
||||
echo "starting docker...$CONTAINER_NAME"
|
||||
echo
|
||||
docker run \
|
||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||
--env-file $ENV_FILE \
|
||||
-e HF_TOKEN="$HF_TOKEN" \
|
||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||
-e MODEL=$MODEL \
|
||||
-e WORKSPACE=/workspace \
|
||||
--name $CONTAINER_NAME \
|
||||
-d \
|
||||
--privileged \
|
||||
--network host \
|
||||
-v /dev/shm:/dev/shm \
|
||||
vllm/vllm-tpu-bm tail -f /dev/null
|
||||
|
||||
echo "run script..."
|
||||
echo
|
||||
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
|
||||
|
||||
echo "copy result back..."
|
||||
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
|
||||
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
|
||||
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
|
||||
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
|
||||
|
||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
|
||||
|
||||
if [ "$BUILDKITE" = "true" ]; then
|
||||
echo "Running inside Buildkite"
|
||||
buildkite-agent artifact upload "$VLLM_LOG"
|
||||
buildkite-agent artifact upload "$BM_LOG"
|
||||
else
|
||||
echo "Not running inside Buildkite"
|
||||
fi
|
||||
|
||||
#
|
||||
# compare the throughput with EXPECTED_THROUGHPUT
|
||||
# and assert meeting the expectation
|
||||
#
|
||||
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
|
||||
echo "Failed to get the throughput"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
|
||||
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
|
||||
exit 1
|
||||
fi
|
@ -1,14 +0,0 @@
|
||||
# Environment config
|
||||
TEST_NAME=llama8bw8a8
|
||||
CONTAINER_NAME=vllm-tpu
|
||||
|
||||
# vllm config
|
||||
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
|
||||
MAX_NUM_SEQS=128
|
||||
MAX_NUM_BATCHED_TOKENS=1024
|
||||
TENSOR_PARALLEL_SIZE=1
|
||||
MAX_MODEL_LEN=2048
|
||||
DOWNLOAD_DIR=/mnt/disks/persist
|
||||
EXPECTED_THROUGHPUT=10.0
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=128
|
@ -1,94 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
VLLM_LOG="$WORKSPACE/vllm_log.txt"
|
||||
BM_LOG="$WORKSPACE/bm_log.txt"
|
||||
|
||||
if [ -n "$TARGET_COMMIT" ]; then
|
||||
head_hash=$(git rev-parse HEAD)
|
||||
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
|
||||
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "model: $MODEL"
|
||||
echo
|
||||
|
||||
#
|
||||
# create a log folder
|
||||
#
|
||||
mkdir "$WORKSPACE/log"
|
||||
|
||||
# TODO: Move to image building.
|
||||
pip install pandas
|
||||
pip install datasets
|
||||
|
||||
#
|
||||
# create sonnet_4x
|
||||
#
|
||||
echo "Create sonnet_4x.txt"
|
||||
echo "" > benchmarks/sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||
done
|
||||
|
||||
#
|
||||
# start vllm service in backend
|
||||
#
|
||||
echo "lanching vllm..."
|
||||
echo "logging to $VLLM_LOG"
|
||||
echo
|
||||
|
||||
VLLM_USE_V1=1 vllm serve $MODEL \
|
||||
--seed 42 \
|
||||
--disable-log-requests \
|
||||
--max-num-seqs $MAX_NUM_SEQS \
|
||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
||||
--no-enable-prefix-caching \
|
||||
--download_dir $DOWNLOAD_DIR \
|
||||
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
||||
|
||||
|
||||
echo "wait for 20 minutes.."
|
||||
echo
|
||||
# sleep 1200
|
||||
# wait for 10 minutes...
|
||||
for i in {1..120}; do
|
||||
# TODO: detect other type of errors.
|
||||
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||
echo "Detected RuntimeError, exiting."
|
||||
exit 1
|
||||
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
|
||||
echo "Application started"
|
||||
break
|
||||
else
|
||||
echo "wait for 10 seconds..."
|
||||
sleep 10
|
||||
fi
|
||||
done
|
||||
|
||||
#
|
||||
# run test
|
||||
#
|
||||
echo "run benchmark test..."
|
||||
echo "logging to $BM_LOG"
|
||||
echo
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--ignore-eos > "$BM_LOG"
|
||||
|
||||
echo "completed..."
|
||||
echo
|
||||
|
||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||
echo "throughput: $throughput"
|
||||
echo
|
@ -1,78 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
# Check that exactly one wheel is found
|
||||
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the single wheel file
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# Rename 'linux' to 'manylinux1' in the wheel filename
|
||||
new_wheel="${wheel/linux/manylinux1}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version: $version"
|
||||
|
||||
normal_wheel="$wheel" # Save the original wheel filename
|
||||
|
||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||
if [[ $version == *dev* ]]; then
|
||||
suffix="${version##*.}"
|
||||
if [[ $suffix == cu* ]]; then
|
||||
new_version="1.0.0.dev+${suffix}"
|
||||
else
|
||||
new_version="1.0.0.dev"
|
||||
fi
|
||||
new_wheel="${wheel/$version/$new_version}"
|
||||
# use cp to keep both files in the artifacts directory
|
||||
cp -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
version="$new_version"
|
||||
fi
|
||||
|
||||
# Upload the wheel to S3
|
||||
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
||||
|
||||
# generate index for this commit
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
else
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
fi
|
||||
|
||||
# generate index for nightly
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
else
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
@ -1,834 +1,166 @@
|
||||
# In this file, you can add more tests to run either by adding a new step or
|
||||
# adding a new command to an existing step. See different options here for examples.
|
||||
|
||||
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
||||
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||
# to generate the final pipeline yaml file.
|
||||
|
||||
# Documentation
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
# commands(list): the list of commands to run for test. incompatbile with command.
|
||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||
# in this case, commands must be specified. the first command runs on first host, the second
|
||||
# command runs on the second host.
|
||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
|
||||
|
||||
# When adding a test
|
||||
# - If the test belong to an existing group, add it there
|
||||
# - If the test is short, add to any existing step
|
||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||
# Note that all steps execute in parallel.
|
||||
# This script will be feed into Jinja template in `test-template.j2` to generate
|
||||
# the final pipeline yaml file.
|
||||
|
||||
steps:
|
||||
##### fast check tests #####
|
||||
|
||||
- label: Documentation Build # 2min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/test_docs"
|
||||
fast_check: true
|
||||
no_gpu: True
|
||||
commands:
|
||||
- pip install -r ../requirements/docs.txt
|
||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
||||
- mkdocs build
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to whitelist
|
||||
# in /vllm/tools/generate_nightly_torch_test.py
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
- tests/async_engine
|
||||
- tests/test_inputs
|
||||
- tests/multimodal
|
||||
- tests/test_utils
|
||||
- tests/worker
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Python-only Installation Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
commands:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Basic Correctness Test # 30min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_preemption
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
commands:
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
|
||||
- label: Core Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/core
|
||||
- vllm/distributed
|
||||
- tests/core
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile/test_basic_correctness
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
commands:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: EPLB Algorithm Test
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution Test # 5min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/metrics
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
- label: Regression Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/tokenization
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
- label: AsyncEngine Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s async_engine
|
||||
|
||||
- label: V1 Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
- label: Basic Correctness Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s v1/kv_connector/unit
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Examples Test # 25min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Core Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s core
|
||||
|
||||
- label: Distributed Comm Ops Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s distributed/test_comm_ops.py
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
|
||||
- label: Distributed Tests
|
||||
mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist.py
|
||||
|
||||
- label: Distributed Tests (Multiple Groups)
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
|
||||
- label: Engine Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
|
||||
|
||||
- label: Entrypoints Test
|
||||
mirror_hardwares: [amd]
|
||||
|
||||
commands:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s entrypoints -m llm
|
||||
- pytest -v -s entrypoints -m openai
|
||||
|
||||
- label: Examples Test
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- examples/
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
# install aws cli for llava_example.py
|
||||
# install tensorizer for tensorize_vllm_model.py
|
||||
- pip install awscli tensorizer
|
||||
- python3 offline_inference.py
|
||||
- python3 offline_inference_with_prefix.py
|
||||
- python3 llm_engine_example.py
|
||||
- python3 llava_example.py
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
|
||||
- label: Prefix Caching Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/prefix_caching
|
||||
- label: Kernels Test %N
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
|
||||
- label: Models Test
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s models --ignore=models/test_llava.py
|
||||
|
||||
- label: Llava Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s models/test_llava.py
|
||||
|
||||
- label: Prefix Caching Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s samplers
|
||||
|
||||
- label: Platform Tests (CUDA)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
- label: LogitsProcessor Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s test_logits_processor.py
|
||||
|
||||
- label: Samplers Test # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
- tests/samplers
|
||||
- tests/conftest.py
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
- label: Utils Test
|
||||
command: pytest -v -s test_utils.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/spec_decode
|
||||
- tests/spec_decode
|
||||
- vllm/model_executor/models/eagle.py
|
||||
commands:
|
||||
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
|
||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||
- label: Worker Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s worker
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
- label: Speculative decoding tests
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s spec_decode
|
||||
|
||||
- label: LoRA Test %N
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
- label: LoRA Long Context (Distributed)
|
||||
#mirror_hardwares: [amd]
|
||||
num_gpus: 4
|
||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
# Temporarily run this way because we cannot clean up GPU mem usage
|
||||
# for multi GPU tests.
|
||||
# TODO(sang): Fix it.
|
||||
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
|
||||
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
|
||||
- pytest -v -s lora/test_long_context.py::test_self_consistency
|
||||
- pytest -v -s lora/test_long_context.py::test_quality
|
||||
- pytest -v -s lora/test_long_context.py::test_max_len
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||
- label: Tensorizer Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Core Operation Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
commands:
|
||||
- pytest -v -s kernels/core
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
commands:
|
||||
- pytest -v -s kernels/moe
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
- label: Metrics Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s metrics
|
||||
|
||||
- label: Quantization Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s quantization
|
||||
|
||||
- label: Benchmarks
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release
|
||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
- pip install aiohttp
|
||||
- bash run-benchmarks.sh
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- label: Documentation Build
|
||||
working_dir: "/vllm-workspace/test_docs/docs"
|
||||
no_gpu: True
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
commands:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/mistral_tool_use
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
- pytest -v -s mistral_tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 24min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_utils.py
|
||||
- pytest -v -s models/test_vision.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Language Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
- label: Language Models Test (Hybrid) # 35 min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language/generation -m hybrid_model
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal/processing
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
||||
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Quantized Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
- label: Distributed Comm Ops Test # 7min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
commands:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
||||
# test sequence parallel
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/sampler.py
|
||||
- vllm/sequence.py
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/multi_step_worker.py
|
||||
- vllm/worker/model_runner_base.py
|
||||
- vllm/worker/model_runner.py
|
||||
- vllm/worker/multi_step_model_runner.py
|
||||
- vllm/engine
|
||||
- tests/multi_step
|
||||
commands:
|
||||
# this test is quite flaky
|
||||
# TODO: investigate and fix.
|
||||
# - pytest -v -s multi_step/test_correctness_async_llm.py
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: LoRA TP Test (Distributed)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
|
||||
- label: Distributed Tests (A100) # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
- pip install -r requirements-docs.txt
|
||||
- SPHINXOPTS=\"-W\" make html
|
||||
|
59
.buildkite/test-template-aws.j2
Normal file
59
.buildkite/test-template-aws.j2
Normal file
@ -0,0 +1,59 @@
|
||||
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
|
||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
||||
|
||||
steps:
|
||||
- label: ":docker: build image"
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
|
||||
- "docker push {{ docker_image }}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 5
|
||||
- wait
|
||||
|
||||
{% for step in steps %}
|
||||
- label: "{{ step.label }}"
|
||||
agents:
|
||||
{% if step.no_gpu %}
|
||||
queue: cpu_queue
|
||||
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
|
||||
queue: gpu_4_queue
|
||||
{% else %}
|
||||
queue: gpu_1_queue
|
||||
{% endif %}
|
||||
soft_fail: true
|
||||
{% if step.parallelism %}
|
||||
parallelism: {{ step.parallelism }}
|
||||
{% endif %}
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 5
|
||||
plugins:
|
||||
- docker#v5.2.0:
|
||||
image: {{ docker_image }}
|
||||
always-pull: true
|
||||
propagate-environment: true
|
||||
{% if not step.no_gpu %}
|
||||
gpus: all
|
||||
{% endif %}
|
||||
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE=ci-test
|
||||
- HF_TOKEN
|
||||
{% if step.label == "Speculative decoding tests" %}
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
{% endif %}
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm
|
||||
{% endfor %}
|
95
.buildkite/test-template.j2
Normal file
95
.buildkite/test-template.j2
Normal file
@ -0,0 +1,95 @@
|
||||
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
|
||||
{% set default_num_gpu = 1 %}
|
||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
||||
|
||||
steps:
|
||||
- label: ":docker: build image"
|
||||
commands:
|
||||
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
|
||||
- "docker push {{ docker_image }}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 5
|
||||
- wait
|
||||
|
||||
- group: "AMD Tests"
|
||||
depends_on: ~
|
||||
steps:
|
||||
{% for step in steps %}
|
||||
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
|
||||
- label: "AMD: {{ step.label }}"
|
||||
agents:
|
||||
queue: amd
|
||||
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
- label: "Neuron Test"
|
||||
depends_on: ~
|
||||
agents:
|
||||
queue: neuron
|
||||
command: bash .buildkite/run-neuron-test.sh
|
||||
soft_fail: true
|
||||
|
||||
- label: "Intel Test"
|
||||
depends_on: ~
|
||||
agents:
|
||||
queue: intel
|
||||
command: bash .buildkite/run-cpu-test.sh
|
||||
|
||||
{% for step in steps %}
|
||||
- label: "{{ step.label }}"
|
||||
agents:
|
||||
queue: kubernetes
|
||||
soft_fail: {{ step.soft_fail or false }}
|
||||
{% if step.parallelism %}
|
||||
parallelism: {{ step.parallelism }}
|
||||
{% endif %}
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 5
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
{% if step.num_gpus %}
|
||||
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
|
||||
{% endif %}
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- image: "{{ docker_image }}"
|
||||
command: ["bash"]
|
||||
args:
|
||||
- '-c'
|
||||
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
||||
{% if not step.no_gpu %}
|
||||
resources:
|
||||
requests:
|
||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
||||
limits:
|
||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
||||
{% endif %}
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
{% endfor %}
|
@ -1,33 +1 @@
|
||||
/.venv
|
||||
/build
|
||||
dist
|
||||
vllm/*.so
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.mypy_cache
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
/build/
|
||||
cmake-build-*/
|
||||
CMakeUserPresets.json
|
||||
develop-eggs/
|
||||
/dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
54
.github/CODEOWNERS
vendored
54
.github/CODEOWNERS
vendored
@ -1,54 +0,0 @@
|
||||
# See https://help.github.com/articles/about-codeowners/
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm
|
||||
/vllm/entrypoints @aarnphm
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multi_step @alexm-redhat @comaniac
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
2
.github/FUNDING.yml
vendored
2
.github/FUNDING.yml
vendored
@ -1,2 +0,0 @@
|
||||
github: [vllm-project]
|
||||
open_collective: vllm
|
7
.github/ISSUE_TEMPLATE/100-documentation.yml
vendored
7
.github/ISSUE_TEMPLATE/100-documentation.yml
vendored
@ -20,10 +20,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
|
9
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
9
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
@ -38,10 +38,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
|
9
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
9
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
@ -36,10 +36,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
|
@ -8,36 +8,21 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
|
||||
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
|
||||
- Passwords or authentication credentials
|
||||
- Private URLs or endpoints
|
||||
- Personal or confidential data
|
||||
|
||||
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
<details>
|
||||
<summary>The output of <code>python collect_env.py</code></summary>
|
||||
|
||||
```text
|
||||
Your output of `python collect_env.py` here
|
||||
The output of `python collect_env.py`
|
||||
```
|
||||
|
||||
</details>
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
@ -85,24 +70,17 @@ body:
|
||||
```
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
The error message you got, with the full traceback.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
||||
value: >
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||
|
||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||
|
||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||
|
||||
Thanks for reporting 🙏!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
Thanks for contributing 🎉!
|
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
@ -1,69 +0,0 @@
|
||||
name: 🧪 CI failure report
|
||||
description: Report a failing test.
|
||||
title: "[CI Failure]: "
|
||||
labels: ["ci-failure"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Include the name of the failing Buildkite step and test file in the title.
|
||||
- type: input
|
||||
attributes:
|
||||
label: Name of failing test
|
||||
description: |
|
||||
Paste in the fully-qualified name of the failing test from the logs.
|
||||
placeholder: |
|
||||
`path/to/test_file.py::test_name[params]`
|
||||
validations:
|
||||
required: true
|
||||
- type: checkboxes
|
||||
attributes:
|
||||
label: Basic information
|
||||
description: Select all items that apply to the failing test.
|
||||
options:
|
||||
- label: Flaky test
|
||||
- label: Can reproduce locally
|
||||
- label: Caused by external libraries (e.g. bug in `transformers`)
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🧪 Describe the failing test
|
||||
description: |
|
||||
Please provide a clear and concise description of the failing test.
|
||||
placeholder: |
|
||||
A clear and concise description of the failing test.
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 📝 History of failing test
|
||||
description: |
|
||||
Since when did the test start to fail?
|
||||
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
|
||||
|
||||
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
|
||||
|
||||
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
|
||||
|
||||
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
|
||||
|
||||
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
|
||||
placeholder: |
|
||||
Approximate timeline and/or problematic PRs
|
||||
|
||||
A link to the Buildkite analytics of the failing test (if available)
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: CC List.
|
||||
description: >
|
||||
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for reporting 🙏!
|
@ -29,10 +29,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
@ -9,7 +9,7 @@ body:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
@ -31,10 +31,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
@ -35,7 +35,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
@ -50,10 +50,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
7
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
7
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
@ -47,10 +47,3 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user