mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[CI]add genai-perf benchmark in nightly benchmark (#10704)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
@ -301,6 +301,104 @@ run_serving_tests() {
|
||||
kill_gpu_processes
|
||||
}
|
||||
|
||||
#######################################
# Run the genai-perf benchmark suite against the currently selected serving
# engine, once per (test case, QPS) pair.
# Globals:
#   TEST_SELECTOR (read)  - optional regex; only matching test names run
#   CURRENT_LLM_SERVING_ENGINE (read) - engine label, prepended to test names
#   gpu_count (read)      - number of GPUs available on this runner
#   VLLM_SOURCE_CODE_LOC (read) - repo root, used to locate launch-server.sh
# Arguments:
#   $1 - path to a JSON file specifying genai-perf test cases
# Outputs:
#   progress/diagnostics to stdout; genai-perf artifacts to its default dir
#######################################
run_genai_perf_tests() {
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf test cases (one compact JSON object per line).
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # Get the test name for this case.
    test_name=$(echo "$params" | jq -r '.test_name')

    # If TEST_SELECTOR is set, only run the test cases matching the selector.
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # Prepend the current serving engine to the test name.
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # Get common parameters shared by all engines.
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # Get engine-specific server arguments and the QPS sweep list.
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # Check there are enough GPUs for the requested tensor parallelism.
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # Iterate over the QPS sweep.
    for qps in $qps_list; do
      # Strip the surrounding single quotes added by jq's @sh filter.
      # (Previously only the 'inf' branch removed them, so numeric QPS
      # values leaked literal quotes into new_test_name.)
      qps=${qps//\'/}

      # genai-perf has no notion of "infinite" rate; approximate it by
      # issuing all prompts at once.
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=${test_name}_qps_${qps}
      backend=$CURRENT_LLM_SERVING_ENGINE

      # genai-perf expects the plain backend id "vllm" regardless of the
      # engine label variant (e.g. "vllm" vs "vllm_something").
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      # Build the client command as an array so arguments survive spaces
      # without eval, and pass the normalized $backend (the previous
      # version computed it but hardcoded --backend vllm).
      client_command=(
        genai-perf profile
        -m "$model"
        --service-kind openai
        --backend "$backend"
        --endpoint-type chat
        --streaming
        --url "localhost:$port"
        --request-rate "$qps"
        --num-prompts "$num_prompts"
      )

      echo "Client command: ${client_command[*]}"

      "${client_command[@]}"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes

}
|
||||
|
||||
prepare_dataset() {
|
||||
|
||||
@ -328,12 +426,17 @@ main() {
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
pip install -r requirements-dev.txt
|
||||
which genai-perf
|
||||
|
||||
# check storage
|
||||
df -h
|
||||
|
||||
ensure_installed wget
|
||||
ensure_installed curl
|
||||
ensure_installed jq
|
||||
# genai-perf dependency
|
||||
ensure_installed libb64-0d
|
||||
|
||||
prepare_dataset
|
||||
|
||||
@ -345,6 +448,10 @@ main() {
|
||||
# run the test
|
||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||
|
||||
# run genai-perf tests
|
||||
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
|
||||
mv artifacts/ $RESULTS_FOLDER/
|
||||
|
||||
# upload benchmark results to buildkite
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||
|
23
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
Normal file
23
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
Normal file
@ -0,0 +1,23 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1_genai_perf",
|
||||
"qps_list": [4,8,16,32],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"port": 8000,
|
||||
"num_prompts": 500,
|
||||
"reuse_server": false
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"genai_perf_input_parameters": {
|
||||
}
|
||||
}
|
||||
]
|
@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test
|
||||
bitsandbytes>=0.45.0
|
||||
buildkite-test-collector==0.1.9
|
||||
|
||||
genai_perf==0.0.8
|
||||
tritonclient==2.51.0
|
||||
|
||||
numpy < 2.0.0
|
||||
|
@ -37,7 +37,7 @@ audioread==3.0.1
|
||||
# via librosa
|
||||
awscli==1.35.23
|
||||
# via -r requirements-test.in
|
||||
bitsandbytes>=0.45.0
|
||||
bitsandbytes==0.45.0
|
||||
# via -r requirements-test.in
|
||||
black==24.10.0
|
||||
# via datamodel-code-generator
|
||||
@ -75,6 +75,8 @@ colorama==0.4.6
|
||||
# tqdm-multiprocess
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cramjam==2.9.0
|
||||
# via fastparquet
|
||||
cupy-cuda12x==13.3.0
|
||||
# via ray
|
||||
cycler==0.12.1
|
||||
@ -109,6 +111,8 @@ email-validator==2.2.0
|
||||
# via pydantic
|
||||
evaluate==0.4.3
|
||||
# via lm-eval
|
||||
fastparquet==2024.11.0
|
||||
# via genai-perf
|
||||
fastrlock==0.8.2
|
||||
# via cupy-cuda12x
|
||||
filelock==3.16.1
|
||||
@ -130,8 +134,11 @@ fsspec[http]==2024.9.0
|
||||
# via
|
||||
# datasets
|
||||
# evaluate
|
||||
# fastparquet
|
||||
# huggingface-hub
|
||||
# torch
|
||||
genai-perf==0.0.8
|
||||
# via -r requirements-test.in
|
||||
genson==1.3.0
|
||||
# via datamodel-code-generator
|
||||
h11==0.14.0
|
||||
@ -186,6 +193,8 @@ jsonschema==4.23.0
|
||||
# ray
|
||||
jsonschema-specifications==2024.10.1
|
||||
# via jsonschema
|
||||
kaleido==0.2.1
|
||||
# via genai-perf
|
||||
kiwisolver==1.4.7
|
||||
# via matplotlib
|
||||
lazy-loader==0.4
|
||||
@ -200,6 +209,8 @@ lm-eval[api]==0.4.4
|
||||
# via -r requirements-test.in
|
||||
lxml==5.3.0
|
||||
# via sacrebleu
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
matplotlib==3.9.2
|
||||
@ -209,6 +220,8 @@ mbstrdecoder==1.1.3
|
||||
# dataproperty
|
||||
# pytablewriter
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common[opencv]==1.5.1
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
@ -249,6 +262,8 @@ numpy==1.26.4
|
||||
# datasets
|
||||
# decord
|
||||
# evaluate
|
||||
# fastparquet
|
||||
# genai-perf
|
||||
# librosa
|
||||
# matplotlib
|
||||
# mistral-common
|
||||
@ -256,15 +271,18 @@ numpy==1.26.4
|
||||
# numexpr
|
||||
# opencv-python-headless
|
||||
# pandas
|
||||
# patsy
|
||||
# peft
|
||||
# rouge-score
|
||||
# sacrebleu
|
||||
# scikit-learn
|
||||
# scipy
|
||||
# soxr
|
||||
# statsmodels
|
||||
# tensorizer
|
||||
# torchvision
|
||||
# transformers
|
||||
# tritonclient
|
||||
nvidia-cublas-cu12==12.4.5.8
|
||||
# via
|
||||
# nvidia-cudnn-cu12
|
||||
@ -306,30 +324,39 @@ packaging==24.1
|
||||
# datamodel-code-generator
|
||||
# datasets
|
||||
# evaluate
|
||||
# fastparquet
|
||||
# huggingface-hub
|
||||
# lazy-loader
|
||||
# matplotlib
|
||||
# peft
|
||||
# plotly
|
||||
# pooch
|
||||
# pytest
|
||||
# pytest-rerunfailures
|
||||
# ray
|
||||
# statsmodels
|
||||
# transformers
|
||||
# typepy
|
||||
pandas==2.2.3
|
||||
# via
|
||||
# datasets
|
||||
# evaluate
|
||||
# fastparquet
|
||||
# genai-perf
|
||||
# statsmodels
|
||||
pathspec==0.12.1
|
||||
# via black
|
||||
pathvalidate==3.2.1
|
||||
# via pytablewriter
|
||||
patsy==1.0.1
|
||||
# via statsmodels
|
||||
peft==0.13.2
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
# lm-eval
|
||||
pillow==10.4.0
|
||||
# via
|
||||
# genai-perf
|
||||
# matplotlib
|
||||
# mistral-common
|
||||
# sentence-transformers
|
||||
@ -338,6 +365,8 @@ platformdirs==4.3.6
|
||||
# via
|
||||
# black
|
||||
# pooch
|
||||
plotly==5.24.1
|
||||
# via genai-perf
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
pooch==1.8.2
|
||||
@ -360,7 +389,9 @@ psutil==6.1.0
|
||||
py==1.11.0
|
||||
# via pytest-forked
|
||||
pyarrow==18.0.0
|
||||
# via datasets
|
||||
# via
|
||||
# datasets
|
||||
# genai-perf
|
||||
pyasn1==0.6.1
|
||||
# via rsa
|
||||
pybind11==2.13.6
|
||||
@ -373,6 +404,8 @@ pydantic[email]==2.9.2
|
||||
# mistral-common
|
||||
pydantic-core==2.23.4
|
||||
# via pydantic
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyparsing==3.2.0
|
||||
# via matplotlib
|
||||
pytablewriter==1.2.0
|
||||
@ -381,14 +414,18 @@ pytest==8.3.3
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
# buildkite-test-collector
|
||||
# genai-perf
|
||||
# pytest-asyncio
|
||||
# pytest-forked
|
||||
# pytest-mock
|
||||
# pytest-rerunfailures
|
||||
# pytest-shard
|
||||
pytest-asyncio==0.24.0
|
||||
# via -r requirements-test.in
|
||||
pytest-forked==1.6.0
|
||||
# via -r requirements-test.in
|
||||
pytest-mock==3.14.0
|
||||
# via genai-perf
|
||||
pytest-rerunfailures==14.0
|
||||
# via -r requirements-test.in
|
||||
pytest-shard==0.1.2
|
||||
@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0
|
||||
# matplotlib
|
||||
# pandas
|
||||
# typepy
|
||||
python-rapidjson==1.20
|
||||
# via tritonclient
|
||||
pytz==2024.2
|
||||
# via
|
||||
# pandas
|
||||
@ -409,9 +448,11 @@ pyyaml==6.0.2
|
||||
# awscli
|
||||
# datamodel-code-generator
|
||||
# datasets
|
||||
# genai-perf
|
||||
# huggingface-hub
|
||||
# peft
|
||||
# ray
|
||||
# responses
|
||||
# timm
|
||||
# transformers
|
||||
ray[adag]==2.40.0
|
||||
@ -438,8 +479,13 @@ requests==2.32.3
|
||||
# mistral-common
|
||||
# pooch
|
||||
# ray
|
||||
# responses
|
||||
# tiktoken
|
||||
# transformers
|
||||
responses==0.25.3
|
||||
# via genai-perf
|
||||
rich==13.9.4
|
||||
# via genai-perf
|
||||
rouge-score==0.1.2
|
||||
# via lm-eval
|
||||
rpds-py==0.20.1
|
||||
@ -470,6 +516,7 @@ scipy==1.13.1
|
||||
# librosa
|
||||
# scikit-learn
|
||||
# sentence-transformers
|
||||
# statsmodels
|
||||
sentence-transformers==3.2.1
|
||||
# via -r requirements-test.in
|
||||
sentencepiece==0.2.0
|
||||
@ -490,6 +537,8 @@ soxr==0.5.0.post1
|
||||
# via librosa
|
||||
sqlitedict==2.1.0
|
||||
# via lm-eval
|
||||
statsmodels==0.14.4
|
||||
# via genai-perf
|
||||
sympy==1.13.1
|
||||
# via torch
|
||||
tabledata==1.3.3
|
||||
@ -499,7 +548,9 @@ tabulate==0.9.0
|
||||
tcolorpy==0.1.6
|
||||
# via pytablewriter
|
||||
tenacity==9.0.0
|
||||
# via lm-eval
|
||||
# via
|
||||
# lm-eval
|
||||
# plotly
|
||||
tensorizer==2.9.0
|
||||
# via -r requirements-test.in
|
||||
threadpoolctl==3.5.0
|
||||
@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11
|
||||
# via lm-eval
|
||||
transformers==4.47.0
|
||||
# via
|
||||
# genai-perf
|
||||
# lm-eval
|
||||
# peft
|
||||
# sentence-transformers
|
||||
@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5
|
||||
# via -r requirements-test.in
|
||||
triton==3.1.0
|
||||
# via torch
|
||||
tritonclient==2.51.0
|
||||
# via
|
||||
# -r requirements-test.in
|
||||
# genai-perf
|
||||
typepy[datetime]==1.3.2
|
||||
# via
|
||||
# dataproperty
|
||||
@ -555,6 +611,7 @@ typepy[datetime]==1.3.2
|
||||
# tabledata
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# bitsandbytes
|
||||
# huggingface-hub
|
||||
# librosa
|
||||
# mistral-common
|
||||
@ -563,10 +620,12 @@ typing-extensions==4.12.2
|
||||
# torch
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
urllib3==1.26.20
|
||||
urllib3==2.2.3
|
||||
# via
|
||||
# botocore
|
||||
# requests
|
||||
# responses
|
||||
# tritonclient
|
||||
word2number==1.1
|
||||
# via lm-eval
|
||||
xxhash==3.5.0
|
||||
|
Reference in New Issue
Block a user