Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits (71 commits):

a5dd03c1eb, c18b3b8e8b, 9528e3a05e, 9fb52e523a, e202dd2736, 43813e6361, cede942b87, fe1e924811,
4548c03c50, 40b86aa05e, 432870829d, f73d02aadc, c5ebe040ac, 8d763cb891, cf4cd53982, 32c9be2200,
8aeaa910a2, 906e05d840, ef9a2990ae, 7e90870491, d3f05c9248, c108781c85, 3d184b95b8, 2f35a022e6,
ffe00ef77a, 5561681d04, fbd62d8750, 2e26f9156a, 9e5452ee34, 0e3fe896e2, 1caca5a589, 783921d889,
4a98edff1f, a7bab0c9e5, 25950dca9b, a4113b035c, 7e1665b089, 8d1096e7db, 8d775dd30a, 78fe77534b,
2f2fcb31b8, 1dba2c4ebe, 71d6de3a26, 536fd33003, 619b9f5c7e, d1b689c445, 9854dc9040, ff5c60fad8,
6f1229f91d, 1819fbda63, 7f0367109e, fb14d53cf6, b024a42e93, cb97f2bfc5, 359200f6ac, 220aee902a,
67d25eca05, 363528de27, 4ff61ababa, 0ec3779df7, b616f6a53d, 2e25bb12a8, 9965c47d0d, 059d4cdb49,
bdb84e26b0, 3dd359147d, 657f2f301a, a1aafc827a, 139508a418, d265414dbc, 48fb076cbc
@@ -11,7 +11,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.

**Benchmarking Duration**: about 1hr.

@@ -31,13 +31,27 @@ Performance benchmark will be triggered when:

- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

Manually trigger the benchmark:

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
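As an illustration of how these variables combine, here is a minimal sketch that launches the benchmark script on a CPU machine with a custom serving-test file. It assumes it is run from the vLLM repo root, and `my-serving-tests.json` is a hypothetical file placed under `.buildkite/nightly-benchmarks/tests/`; exporting the variables in the shell before calling the script works just as well.

```python
# Illustrative sketch only: launch run-performance-benchmarks.sh with the
# documented environment variables set. The custom JSON file name is an
# assumption for the example.
import os
import subprocess

env = os.environ.copy()
env["ON_CPU"] = "1"                            # benchmark on Intel Xeon CPUs
env["SERVING_JSON"] = "my-serving-tests.json"  # hypothetical file under tests/
# To benchmark an already running remote vLLM server instead, set:
# env["REMOTE_HOST"] = "10.0.0.5"; env["REMOTE_PORT"] = "8000"

subprocess.run(
    ["bash", ".buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"],
    env=env,
    check=True,
)
```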
Nightly benchmark will be triggered when:

- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:

@@ -119,6 +133,30 @@ If you do not see the table, please wait till the benchmark finish running.

The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmark run.

The `compare-json-results.py` script helps compare benchmark result JSON files converted using `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json` files.
`compare-json-results.py` compares two `benchmark_results.json` files and reports the performance ratio for metrics such as Output Tput, Median TTFT and Median TPOT.

Here is an example using the script to compare result_a and result_b without detailed test names:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`

|   | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|---|----------------------------------|----------------------------------|------------|
| 0 | 142.633982                       | 156.526018                       | 1.097396   |
| 1 | 241.620334                       | 294.018783                       | 1.216863   |
| 2 | 218.298905                       | 262.664916                       | 1.203235   |
| 3 | 242.743860                       | 299.816190                       | 1.235113   |

Here is an example using the script to compare result_a and result_b with detailed test names:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

|   | results_a/benchmark_results.json_name     | results_a/benchmark_results.json | results_b/benchmark_results.json_name     | results_b/benchmark_results.json | perf_ratio |
|---|-------------------------------------------|----------------------------------|-------------------------------------------|----------------------------------|------------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1        | 142.633982                       | serving_llama8B_tp1_sharegpt_qps_1        | 156.526018                       | 1.097396   |
| 1 | serving_llama8B_tp1_sharegpt_qps_16       | 241.620334                       | serving_llama8B_tp1_sharegpt_qps_16       | 294.018783                       | 1.216863   |
| 2 | serving_llama8B_tp1_sharegpt_qps_4        | 218.298905                       | serving_llama8B_tp1_sharegpt_qps_4        | 262.664916                       | 1.203235   |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf      | 242.743860                       | serving_llama8B_tp1_sharegpt_qps_inf      | 299.816190                       | 1.235113   |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390                        | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853                       | 1.122048   |
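For reference, a minimal pandas sketch of how the `perf_ratio` column above is derived from two `benchmark_results.json` files. It mirrors what `compare-json-results.py` does (including dropping non-serving rows via the `P99` column) and assumes both runs contain the same test cases in the same order; the file paths are placeholders.

```python
# Minimal sketch of the perf_ratio computation: read two benchmark_results.json
# files produced by convert-results-json-to-markdown.py and divide one metric
# column by the other.
import pandas as pd

results_a = pd.read_json("results_a/benchmark_results.json").dropna(
    subset=["P99"], ignore_index=True  # keep only serving rows, as the script does
)
results_b = pd.read_json("results_b/benchmark_results.json").dropna(
    subset=["P99"], ignore_index=True
)

metric = "Output Tput (tok/s)"  # any shared numeric column works
comparison = pd.DataFrame(
    {
        "Test name": results_a["Test name"],
        "results_a": results_a[metric],
        "results_b": results_b[metric],
    }
)
comparison["perf_ratio"] = comparison["results_b"] / comparison["results_a"]
print(comparison)
```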
## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description of the test workload, models and docker containers used to benchmark other LLM engines.
@@ -4,7 +4,8 @@

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}

@@ -14,7 +15,8 @@

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}

@@ -25,12 +27,18 @@

- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed); see the sketch after this list.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  - We also added a speculative decoding test for llama-3 70B, under QPS 2
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
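The serving test samples request arrival times from a Poisson process for a given average QPS. As a concrete illustration (not the benchmark client's actual code), here is a minimal sketch of that sampling with a fixed seed:

```python
# Illustrative sketch: draw request arrival times for an average QPS using a
# Poisson process, i.e. exponentially distributed inter-arrival gaps, with a
# fixed seed as described above.
import numpy as np


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    if np.isinf(qps):
        # QPS = inf: every request arrives at time 0 (all at once).
        return np.zeros(num_requests)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)  # absolute arrival times in seconds


print(poisson_arrival_times(num_requests=5, qps=4))
```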
{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

import pandas as pd


def compare_data_columns(
    files, name_column, data_column, drop_column, ignore_test_name=False
):
    print("\ncompare_data_column: " + data_column)
    frames = []
    compare_frames = []
    for file in files:
        data_df = pd.read_json(file)
        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
        if ignore_test_name is False:
            serving_df = serving_df.rename(columns={name_column: file + "_name"})
            frames.append(serving_df[file + "_name"])
        serving_df = serving_df.rename(columns={data_column: file})
        frames.append(serving_df[file])
        compare_frames.append(serving_df[file])
        if len(compare_frames) >= 2:
            # Compare numbers among two files
            ratio_df = compare_frames[1] / compare_frames[0]
            frames.append(ratio_df)
            compare_frames.pop(1)

    concat_df = pd.concat(frames, axis=1)
    return concat_df


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
    )
    args = parser.parse_args()
    files = args.file
    print("comparing : " + ", ".join(files))

    drop_column = "P99"
    name_column = "Test name"
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
    ignore_test_name = args.ignore_test_name
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                drop_column,
                ignore_test_name=ignore_test_name,
            )
            print(output_df)
            html = output_df.to_html()
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
@@ -3,9 +3,11 @@

import json
import os
from importlib import util
from pathlib import Path

import pandas as pd
import psutil
from tabulate import tabulate

results_folder = Path("results/")

@@ -29,11 +31,11 @@ throughput_results = []

throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    # "tokens_per_second": "Tput (tok/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown

@@ -41,16 +43,18 @@ serving_results = []

serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "completed": "# of req.",
    "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",

@@ -75,6 +79,20 @@ def results_to_json(latency, throughput, serving):

    )


def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor


if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):

@@ -155,6 +173,27 @@ if __name__ == "__main__":

    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

@@ -200,6 +239,9 @@ if __name__ == "__main__":

    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

@@ -211,6 +253,7 @@ if __name__ == "__main__":

            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)
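For quick local inspection outside the pipeline, the same platform details that the report generator above now appends to `benchmark_results.md` can be printed in a few lines. This is a standalone sketch that reuses the optional-import guard from the script; it is not part of the benchmark tooling, and the optional `numa` and `cpuinfo` modules are only used when installed.

```python
# Standalone sketch: print the platform details that the report generator
# collects (core counts, memory, and, when the optional packages are
# installed, NUMA node count and CPU brand).
from importlib import util

import psutil

details = {
    "Physical cores": psutil.cpu_count(logical=False),
    "Total cores": psutil.cpu_count(logical=True),
    "Total Memory (bytes)": psutil.virtual_memory().total,
}

if util.find_spec("numa") is not None:
    from numa import info as numa_info

    details["Total NUMA nodes"] = numa_info.get_num_configured_nodes()

if util.find_spec("cpuinfo") is not None:
    from cpuinfo import get_cpu_info

    details["CPU Brand"] = get_cpu_info()["brand_raw"]

for key, value in details.items():
    print(f"{key}: {value}")
```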
@@ -31,6 +31,20 @@ check_gpus() {

  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then

@@ -69,6 +83,22 @@ json2args() {

  echo "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes

@@ -158,15 +188,24 @@ run_latency_tests() {

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    latency_command="python3 benchmark_latency.py \
    latency_command=" $latency_envs python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

@@ -216,15 +255,24 @@ run_throughput_tests() {

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    throughput_command="python3 benchmark_throughput.py \
    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

@@ -272,19 +320,28 @@ run_serving_tests() {

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # check if server model and client model is aligned
    server_model=$(echo "$server_params" | jq -r '.model')

@@ -294,23 +351,33 @@ run_serving_tests() {

      continue
    fi

    server_command="python3 \
    server_command="$server_envs python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!

      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vllm server is up and running."
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vllm failed to start within the timeout period."
        echo "vLLM failed to start within the timeout period."
      fi
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi

    # iterate over different QPS

@@ -332,7 +399,7 @@ run_serving_tests() {

        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"
        $client_args $client_remote_args "

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

@@ -360,7 +427,14 @@ run_serving_tests() {

}

main() {
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ];then
    check_cpus
    ARCH='-cpu'
  else
    check_gpus
  fi
  check_hf_token

  # Set to v1 to run v1 benchmark

@@ -386,9 +460,9 @@ main() {

  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
Normal file
30
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
Normal file
@ -0,0 +1,30 @@
|
||||
[
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama8B_tp4",
|
||||
"environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
"num_iters": 15
|
||||
}
|
||||
}
|
||||
]
|
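To see how an entry like `latency_llama8B_tp1` is consumed, here is an illustrative Python sketch of the transformation that `run-performance-benchmarks.sh` applies with `jq`: `environment_variables` become `KEY=value` prefixes (the `json2envs` helper) and `parameters` become CLI flags for `benchmark_latency.py` (the `json2args` helper). The underscore-to-dash flag conversion is an assumption inferred from flags such as `--tensor-parallel-size`; the real script does this in shell, not Python.

```python
# Illustration only: mimic json2envs/json2args from run-performance-benchmarks.sh
# to show the command line that the first test case above expands into.
import json

test_case = json.loads("""
{
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "parameters": {"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
                   "tensor_parallel_size": 1, "load_format": "dummy",
                   "num_iters_warmup": 5, "num_iters": 15}
}
""")

envs = " ".join(f"{k}={v}" for k, v in test_case["environment_variables"].items())
args = " ".join(f"--{k.replace('_', '-')} {v}" for k, v in test_case["parameters"].items())

print(f"{envs} python3 benchmark_latency.py {args}")
# -> VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 VLLM_CPU_KVCACHE_SPACE=40 python3 benchmark_latency.py
#    --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 1 ...
```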
.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json (new file)

@@ -0,0 +1,158 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 1024,
            "random-output-len": 128,
            "ignore-eos": "",
            "max_concurrency": 100,
            "num_prompts": 100
        }
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 6,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 1024,
            "random-output-len": 128,
            "ignore-eos": "",
            "max_concurrency": 100,
            "num_prompts": 100
        }
    }
]
@@ -0,0 +1,32 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
@@ -52,7 +52,7 @@ steps:

      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"

@@ -101,7 +101,7 @@ steps:

      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
@@ -107,10 +107,9 @@ fi

if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/stest_attention_selector.py \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_blocksparse_attention.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
.buildkite/scripts/tpu/quantized_v6e_1.env (new file)

@@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=vllm-tpu

# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128
@@ -155,6 +155,7 @@ steps:

  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
  commands:
  # test with tp=2 and external_dp=2

@@ -163,8 +164,9 @@ steps:

  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py

@@ -215,7 +217,7 @@ steps:

##### 1 GPU test #####

- label: Regression Test # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/test_regression

@@ -225,7 +227,7 @@ steps:

  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine

@@ -338,7 +340,7 @@ steps:

  parallelism: 4

- label: PyTorch Compilation Unit Tests
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/

@@ -420,7 +422,7 @@ steps:

  - pytest -v -s kernels/mamba

- label: Tensorizer Test # 11min
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader

@@ -512,7 +514,7 @@ steps:

##### models test #####

- label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/

@@ -601,7 +603,7 @@ steps:

  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/

@@ -682,10 +684,12 @@ steps:

  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
.github/CODEOWNERS (vendored)

@@ -16,7 +16,7 @@

/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
.github/mergify.yml (vendored)

@@ -74,14 +74,25 @@ pull_request_rules:

      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
    actions:
      label:
        add:
          - multi-modality

  - name: label-new-model
    description: Automatically apply new-model label
    conditions:
      - and:
          - files~=^vllm/model_executor/models/
          - files=vllm/model_executor/models/registry.py
          - files=tests/models/registry.py
          - files=docs/models/supported_models.md
    actions:
      label:
        add:
          - new-model

  - name: label-performance
    description: Automatically apply performance label
    conditions:

@@ -156,8 +167,14 @@ pull_request_rules:

    conditions:
      - or:
          - files~=^vllm/spec_decode/
          - files~=^vllm/v1/spec_decode/
          - files=vllm/model_executor/layers/spec_decode_base_sampler.py
          - files~=^tests/spec_decode/
          - files~=^tests/v1/spec_decode/
          - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
          - files~=^vllm/model_executor/models/.*eagle.*\.py
          - files=vllm/model_executor/models/mlp_speculator.py
          - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
    actions:
      label:
        add:
.github/workflows/lint-and-deploy.yaml (vendored)

@@ -68,7 +68,7 @@ jobs:

          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |
@@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -616,6 +616,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

    endif()
  endif()

  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
    else()
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  #
  # Machete kernels
  #
@@ -1,4 +1,5 @@

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

@@ -620,7 +620,7 @@ def main(args: argparse.Namespace):

            4096,
        ]
    else:
        batch_sizes = [args.batch_size]
        batch_sizes = args.batch_size

    use_deep_gemm = bool(args.use_deep_gemm)

@@ -728,7 +728,7 @@

    )
    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--batch-size", type=int, nargs="+", required=False)
    parser.add_argument("--tune", action="store_true")
    parser.add_argument("--trust-remote-code", action="store_true")
    parser.add_argument("--model-prefix", type=str, required=False)
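The switch to `nargs="+"` means `--batch-size` now accepts one or more values and always parses into a list, which is why the `[args.batch_size]` wrapping above is dropped. A minimal, self-contained illustration of that argparse behavior:

```python
# Minimal illustration of the nargs="+" change: --batch-size now parses into a
# list of ints, so it can be iterated over directly.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, nargs="+", required=False)

print(parser.parse_args(["--batch-size", "1", "16", "128"]).batch_size)  # [1, 16, 128]
print(parser.parse_args(["--batch-size", "32"]).batch_size)              # [32]
```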
@@ -12,9 +12,8 @@

#
# Define environment variables for special configurations
#
if(DEFINED ENV{VLLM_CPU_AVX512BF16})
  set(ENABLE_AVX512BF16 ON)
endif()
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})

include_directories("${CMAKE_SOURCE_DIR}/csrc")

@@ -107,9 +106,18 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)

    endif()

    find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND)
    if (AVX512VNNI_FOUND)
    if (AVX512VNNI_FOUND OR ENABLE_AVX512VNNI)
        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
            list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni")
            set(ENABLE_AVX512VNNI ON)
        else()
            set(ENABLE_AVX512VNNI OFF)
            message(WARNING "Disable AVX512-VNNI ISA support, requires gcc/g++ >= 12.3")
        endif()
    else()
        set(ENABLE_AVX512VNNI OFF)
        message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
    endif()

elseif (AVX2_FOUND)

@@ -257,6 +265,8 @@ elseif(POWER10_FOUND)

        ${VLLM_EXT_SRC})
endif()

message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")

#
# Define extension targets
#
@@ -45,7 +45,6 @@

#include "cute/algorithm/functional.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"

#include "cutlass_extensions/gemm/dispatch_policy.hpp"
@@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel(

  // calculate for absmax
  float thread_max = 0.f;
  for (int i = tid; i < hidden_size; i += stride) {
    const auto v = fabsf(static_cast<float>(row_in[i]));
  vectorize_read_with_alignment<16>(
      row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
        const float v = fabsf(static_cast<float>(src));
        thread_max = fmaxf(thread_max, v);
  }
      });
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);

@@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(

  // 1. calculate min & max
  MinMax thread_mm;
  for (int i = tid; i < hidden_size; i += stride) {
    thread_mm += static_cast<float>(row_in[i]);
  }
  vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
                                    [&] __device__(const scalar_t& src) {
                                      thread_mm += static_cast<float>(src);
                                    });

  using BlockReduce = cub::BlockReduce<MinMax, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
@@ -51,7 +51,8 @@ struct cutlass_3x_gemm {

  // These are the minimum alignments needed for the kernels to compile
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
  static constexpr int AlignmentCD = 4;
  static constexpr int AlignmentCD =
      128 / cutlass::sizeof_bits<ElementD>::value;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
@ -0,0 +1,374 @@
|
||||
#include "core/registration.h"
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <cutlass/arch/arch.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
#include "cutlass/util/reference/device/gemm.h"
|
||||
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/gett.hpp"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include <cassert>
|
||||
|
||||
using namespace cute;
|
||||
|
||||
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
|
||||
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||
__global__ void get_ggemm_starts(
|
||||
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
|
||||
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
|
||||
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
|
||||
ElementAB* b_base_as_int, ElementC* out_base_as_int,
|
||||
ElementAccumulator* a_scale_base_as_int,
|
||||
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
|
||||
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
|
||||
int expert_id = threadIdx.x;
|
||||
|
||||
if (expert_id >= gridDim.x * blockDim.x) {
|
||||
return;
|
||||
}
|
||||
|
||||
int m = problem_sizes[expert_id * 3];
|
||||
int n = problem_sizes[expert_id * 3 + 1];
|
||||
int k = problem_sizes[expert_id * 3 + 2];
|
||||
|
||||
int32_t expert_offset = expert_offsets[expert_id];
|
||||
int a_stride = expert_offset * k;
|
||||
int b_stride = expert_id * k * n;
|
||||
int a_scale_stride = expert_offset * k / 128;
|
||||
int b_scale_stride = expert_id * k * n / 128 / 128;
|
||||
|
||||
a_offsets[expert_id] = a_base_as_int + a_stride;
|
||||
b_offsets[expert_id] = b_base_as_int + b_stride;
|
||||
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
|
||||
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
|
||||
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
|
||||
|
||||
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
|
||||
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
|
||||
|
||||
*layout_sfa_ptr =
|
||||
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
|
||||
*layout_sfb_ptr =
|
||||
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
|
||||
}
|
||||
|
||||
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
|
||||
ScaleConfig) \
|
||||
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
|
||||
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
|
||||
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
|
||||
static_cast<int32_t*>(expert_offsets.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
|
||||
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
|
||||
static_cast<float**>(a_scales_ptrs.data_ptr()), \
|
||||
static_cast<float**>(b_scales_ptrs.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
|
||||
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
|
||||
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
|
||||
static_cast<float*>(a_scales.data_ptr()), \
|
||||
static_cast<float*>(b_scales.data_ptr()), \
|
||||
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
|
||||
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
|
||||
static_cast<int*>(problem_sizes.data_ptr())); \
|
||||
}
|
||||
|
||||
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||
void run_get_ggemm_starts(
|
||||
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
|
||||
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
|
||||
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
|
||||
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
|
||||
torch::Tensor out_tensors, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
|
||||
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
|
||||
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
|
||||
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
|
||||
|
||||
int num_experts = (int)expert_offsets.size(0);
|
||||
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
||||
|
||||
if (false) {
|
||||
}
|
||||
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
|
||||
LayoutSFB, ScaleConfig)
|
||||
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
|
||||
LayoutSFB, ScaleConfig)
|
||||
else {
|
||||
TORCH_CHECK(false, "Unsupported output tensor type");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OutType, typename ScheduleConfig, typename LayoutD>
|
||||
void run_blockwise_scaled_group_mm(
|
||||
    torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
    const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
    const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
    const torch::Tensor& stride_b, const torch::Tensor& stride_c,
    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;

  // Types
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = OutType;
  using ElementD = ElementC;
  using ElementAccumulator = float;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = LayoutD;

  // Alignments
  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ArchTag = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
          typename ScheduleConfig::ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
          AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;

  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementA,
          cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
          AlignmentA, ElementB,
          cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
          AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
          typename ScheduleConfig::ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          typename ScheduleConfig::KernelSchedule>::CollectiveOp;

  using GemmKernel =
      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
                                           CollectiveEpilogue, void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
  using StrideD = typename Gemm::GemmKernel::InternalStrideD;

  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
  int num_experts = (int)expert_offsets.size(0);

  Gemm gemm_op;

  // Mainloop Arguments
  typename GemmKernel::MainloopArguments mainloop_args{
      static_cast<const ElementA**>(a_ptrs.data_ptr()),
      static_cast<StrideA*>(stride_a.data_ptr()),
      static_cast<const ElementB**>(b_ptrs.data_ptr()),
      static_cast<StrideB*>(stride_b.data_ptr()),
      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
          layout_sfa.data_ptr()),
      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
          layout_sfb.data_ptr())};

  cutlass::KernelHardwareInfo hw_info;
  hw_info.device_id = a_ptrs.get_device();
  hw_info.sm_count =
      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
          hw_info.device_id);

  // Epilogue Arguments
  typename GemmKernel::EpilogueArguments epilogue_args{
      {},  // epilogue.thread
      nullptr,
      static_cast<StrideC*>(stride_c.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(stride_c.data_ptr())};

  UnderlyingProblemShape* problem_sizes_as_shapes =
      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());

  // Gemm Arguments
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped,
      {num_experts, problem_sizes_as_shapes, nullptr},
      mainloop_args,
      epilogue_args,
      hw_info};

  at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(a_ptrs.get_device());

  auto can_implement_status = gemm_op.can_implement(args);
  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
              "Failed to implement GEMM");

  size_t workspace_size = gemm_op.get_workspace_size(args);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
  auto workspace = torch::empty(workspace_size, workspace_options);

  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");

  status = gemm_op.run(stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}

template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  struct MmaConfig {
    using ElementA = cutlass::float_e4m3_t;
    using KernelSchedule =
        cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
    using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
        1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
    using LayoutC = cutlass::layout::RowMajor;
    using MmaTileShape = Shape<_128, _128, _128>;
    using ClusterShape = Shape<_1, _1, _1>;
  };

  int num_experts = (int)expert_offsets.size(0);

  auto a_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto b_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto out_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto a_scales_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto b_scales_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));

  auto layout_sfa = torch::empty(
      {num_experts, 5},
      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
  auto layout_sfb = torch::empty(
      {num_experts, 5},
      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));

  auto stride_a = torch::full(
      {num_experts}, a.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto stride_b = torch::full(
      {num_experts}, a.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto stride_c = torch::full(
      {num_experts}, output.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));

  torch::TensorOptions options_int =
      torch::TensorOptions().dtype(torch::kInt64).device(a.device());

  run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
                       typename MmaConfig::LayoutSFB,
                       typename MmaConfig::ScaleConfig>(
      expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
      b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);

  run_blockwise_scaled_group_mm<OutType, MmaConfig,
                                typename MmaConfig::LayoutC>(
      out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
      stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
      expert_offsets);
}

void cutlass_blockwise_scaled_grouped_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
  TORCH_CHECK(problem_sizes.size(1) == 3,
              "problem_sizes must have shape (num_experts, 3)");
  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
              "Number of experts in problem_sizes must match expert_offsets");
  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
              "problem_sizes must be int32");
  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
              "a must be kFloat8_e4m3fn");
  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
              "b must be kFloat8_e4m3fn");
  TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
                  output.scalar_type() == torch::kHalf,
              "output must be bfloat16 or half");
  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
              "scales_a must be float32");
  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
              "scales_b must be float32");
  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
              "expert_offsets must be int32");

  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
  TORCH_CHECK(problem_sizes.size(1) == 3,
              "problem_sizes must have shape (num_experts, 3)");
  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
              "Number of experts in problem_sizes must match expert_offsets");
  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
              "problem_sizes must be int32");
  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");

#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
  if (output.scalar_type() == torch::kBFloat16) {
    blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
  } else if (output.scalar_type() == torch::kFloat16) {
    blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
  } else {
    TORCH_CHECK(false, "Unsupported output tensor type");
  }
#endif
}

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("cutlass_blockwise_scaled_grouped_mm",
         &cutlass_blockwise_scaled_grouped_mm);
}
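For orientation, the host-side contract encoded by the `TORCH_CHECK`s above can be exercised from Python once the extension is built. The sketch below is illustrative only: the `torch.ops._C` namespace, the expert sizes, and the block-scale shapes (1x128 for activations, 128x128 for weights) are assumptions, not taken from this diff; the argument order follows the schema string registered for `cutlass_blockwise_scaled_grouped_mm` elsewhere in this change.

```python
# Hypothetical smoke test for the grouped GEMM op above (not part of this PR).
import torch

num_experts, m_per_expert, n, k = 4, 64, 256, 512
dev = "cuda"

a = torch.randn(num_experts * m_per_expert, k, device=dev).to(torch.float8_e4m3fn)
b = torch.randn(num_experts, n, k, device=dev).to(torch.float8_e4m3fn)
# Assumed blockwise-scale shapes for the 1x128 / 128x128 granularity of the kernel config.
scales_a = torch.ones(num_experts * m_per_expert, k // 128, device=dev, dtype=torch.float32)
scales_b = torch.ones(num_experts, n // 128, k // 128, device=dev, dtype=torch.float32)
output = torch.empty(num_experts * m_per_expert, n, device=dev, dtype=torch.bfloat16)

problem_sizes = torch.tensor([[m_per_expert, n, k]] * num_experts,
                             device=dev, dtype=torch.int32)
expert_offsets = torch.arange(num_experts, device=dev, dtype=torch.int32) * m_per_expert

# Argument order matches the registered schema: output, a, b, scales, sizes, offsets.
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
    output, a, b, scales_a, scales_b, problem_sizes, expert_offsets)
```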
@ -38,7 +38,6 @@
#include "cute/atom/mma_atom.hpp"
#include "cute/atom/copy_traits_sm90_tma.hpp"
#include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"

@ -27,6 +27,26 @@ __device__ inline void vectorize_with_alignment(
  constexpr int WIDTH = VEC_SIZE * sizeof(InT);  // eg: 64 B
  uintptr_t addr = reinterpret_cast<uintptr_t>(in);

  // fast path when the whole region is already aligned
  // Note: currently the output is guaranteed to be same as the input, so we
  // don't check it here, comments here just for future reference.
  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
  if (can_vec) {
    int num_vec = len / VEC_SIZE;

    using vin_t = vec_n_t<InT, VEC_SIZE>;
    using vout_t = vec_n_t<OutT, VEC_SIZE>;
    auto* v_in = reinterpret_cast<const vin_t*>(in);
    auto* v_out = reinterpret_cast<vout_t*>(out);

    for (int i = tid; i < num_vec; i += stride) {
      vout_t tmp;
      vec_op(tmp, v_in[i]);
      v_out[i] = tmp;
    }
    return;
  }

  int misalignment_offset = addr & (WIDTH - 1);       // addr % 64
  int alignment_bytes = WIDTH - misalignment_offset;  // 64 - (addr % 64)
  int prefix_elems = alignment_bytes & (WIDTH - 1);   // handle 64
@ -72,4 +92,81 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
                                std::forward<ScaOp>(scalar_op));
}

template <int VEC_SIZE, typename InT, typename ScaOp>
struct DefaultReadVecOp {
  ScaOp scalar_op;

  __device__ __forceinline__ void operator()(
      const vec_n_t<InT, VEC_SIZE>& src) const {
#pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      scalar_op(src.val[i]);
    }
  }
};

// read-only version: iterate over the input with alignment guarantees
template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
                                                     int tid, int stride,
                                                     VecOp&& vec_op,
                                                     ScaOp&& scalar_op) {
  static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
                "VEC_SIZE must be a positive power-of-two");
  constexpr int WIDTH = VEC_SIZE * sizeof(InT);
  uintptr_t addr = reinterpret_cast<uintptr_t>(in);

  // fast path when the whole region is already aligned
  bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
  if (can_vec) {
    int num_vec = len / VEC_SIZE;

    using vin_t = vec_n_t<InT, VEC_SIZE>;
    auto* v_in = reinterpret_cast<const vin_t*>(in);

    for (int i = tid; i < num_vec; i += stride) {
      vec_op(v_in[i]);
    }
    return;
  }

  int misalignment_offset = addr & (WIDTH - 1);
  int alignment_bytes = WIDTH - misalignment_offset;
  int prefix_elems = alignment_bytes & (WIDTH - 1);
  prefix_elems /= sizeof(InT);
  prefix_elems = min(prefix_elems, len);

  // 1. handle the possibly unaligned prefix with scalar access.
  for (int i = tid; i < prefix_elems; i += stride) {
    scalar_op(in[i]);
  }

  in += prefix_elems;
  len -= prefix_elems;

  int num_vec = len / VEC_SIZE;
  using vin_t = vec_n_t<InT, VEC_SIZE>;
  auto* v_in = reinterpret_cast<const vin_t*>(in);

  // 2. vectorized traversal of the main aligned region.
  for (int i = tid; i < num_vec; i += stride) {
    vec_op(v_in[i]);
  }

  // 3. handle remaining tail elements.
  int tail_start = num_vec * VEC_SIZE;
  for (int i = tid + tail_start; i < len; i += stride) {
    scalar_op(in[i]);
  }
}

// overload that requires only a scalar_op
template <int VEC_SIZE, typename InT, typename ScaOp>
__device__ __forceinline__ void vectorize_read_with_alignment(
    const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
  using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
  vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
                                          std::forward<ScaOp>(scalar_op));
}

}  // namespace vllm
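The prefix/alignment arithmetic in these helpers is easy to sanity-check outside the kernel. Below is a small, purely illustrative Python rendering of the same bit tricks for a 64-byte vector width (`VEC_SIZE = 16` float32 elements); it is not vLLM code.

```python
# Host-side illustration of the alignment math in vectorize_read_with_alignment.
VEC_SIZE = 16                      # elements per vector load
ELEM_BYTES = 4                     # e.g. float32 input
WIDTH = VEC_SIZE * ELEM_BYTES      # 64 B, a power of two

def prefix_elements(addr: int, length: int) -> int:
    """Number of leading elements that must be handled with scalar accesses."""
    misalignment_offset = addr & (WIDTH - 1)          # addr % 64
    alignment_bytes = WIDTH - misalignment_offset     # bytes up to the next 64 B boundary
    prefix = (alignment_bytes & (WIDTH - 1)) // ELEM_BYTES  # collapses 64 -> 0
    return min(prefix, length)

assert prefix_elements(0x1000, 1024) == 0   # already 64 B aligned: no scalar prefix
assert prefix_elements(0x1004, 1024) == 15  # 60 misaligned bytes -> 15 float32 elements
assert prefix_elements(0x1004, 8) == 8      # short buffers stay fully scalar
```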
@ -59,6 +59,8 @@ void apply_repetition_penalties_(
  int vocab_size = logits.size(-1);
  int num_seqs = logits.size(0);

  if (num_seqs == 0) return;

  // Get number of SMs on the current device
  int sms = 0;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
@ -79,7 +79,8 @@ struct cutlass_sparse_3x_gemm {
  // These are the minimum alignments needed for the kernels to compile
  static constexpr int AlignmentAB =
      128 / cutlass::sizeof_bits<ElementAB>::value;
  static constexpr int AlignmentCD = 4;
  static constexpr int AlignmentCD =
      128 / cutlass::sizeof_bits<ElementD>::value;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
@ -393,6 +393,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      {stride_tag});
  ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);

  // cutlass blockwise scaled group GEMM
  ops.def(
      "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
      "Tensor scales_a, Tensor scales_b, "
      "Tensor problem_sizes, Tensor expert_offsets) -> ()",
      {stride_tag});
  // conditionally compiled so impl registration is in source file

  // cutlass nvfp4 block scaled group GEMM
  ops.def(
      "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
@ -1,3 +1,4 @@

# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

@ -62,12 +63,16 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables built-in KV-connector dependency libs in docker images
ARG INSTALL_KV_CONNECTORS=false

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL
@ -276,6 +281,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
@ -374,23 +380,44 @@ ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashin
ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.6.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
    . /etc/environment
    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
        # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
        if [[ "$CUDA_VERSION" == 12.8* ]]; then \
            uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL} ; \
        else \
            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' && \
            git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive && \
        if [[ "$CUDA_VERSION" == 12.8* ]]; then
            uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
        else
            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
            git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive
            # Needed to build AOT kernels
            (cd flashinfer && \
                python3 -m flashinfer.aot && \
                uv pip install --system --no-build-isolation . \
            ) && \
            rm -rf flashinfer; \
            )
            rm -rf flashinfer

            # Default arches (skipping 10.0a and 12.0 since these need 12.8)
            # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
            TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
            if [[ "${CUDA_VERSION}" == 11.* ]]; then
                TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
            fi
            echo "🏗️  Building FlashInfer for arches: ${TORCH_CUDA_ARCH_LIST}"

            git clone --depth 1 --recursive --shallow-submodules \
                --branch v0.2.6.post1 \
                https://github.com/flashinfer-ai/flashinfer.git flashinfer

            pushd flashinfer
            python3 -m flashinfer.aot
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \
                uv pip install --system --no-build-isolation .
            popd

            rm -rf flashinfer
    fi \
    fi
BASH
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
@ -464,6 +491,7 @@ RUN mv mkdocs.yaml test_docs/
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -472,12 +500,17 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

ENV VLLM_USAGE_SOURCE production-docker-image
@ -8,6 +8,8 @@
# Build arguments:
#  PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
#  VLLM_CPU_DISABLE_AVX512=false (default)|true
#  VLLM_CPU_AVX512BF16=false (default)|true
#  VLLM_CPU_AVX512VNNI=false (default)|true
#

######################### BASE IMAGE #########################
@ -25,7 +27,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

@ -60,8 +62,14 @@ FROM base AS vllm-build

ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ARG VLLM_CPU_DISABLE_AVX512=0
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
ARG VLLM_CPU_AVX512BF16=0
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}

WORKDIR /workspace/vllm

@ -134,6 +142,7 @@ ADD ./tests/ ./tests/
ADD ./examples/ ./examples/
ADD ./benchmarks/ ./benchmarks/
ADD ./vllm/collect_env.py .
ADD ./.buildkite/ ./.buildkite/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
@ -14,7 +14,7 @@ Before setting up the incremental build:
    VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
    ```

2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.

3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
@ -10,6 +10,22 @@ This document walks you through the steps to extend a basic model so that it acc
It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
Further update the model as follows:

- Implement [get_placeholder_str][vllm.model_executor.models.interfaces.SupportsMultiModal.get_placeholder_str] to define the placeholder string which is used to represent the multi-modal item in the text prompt. This should be consistent with the chat template of the model.

    ??? Code

        ```python
        class YourModelForImage2Seq(nn.Module):
            ...

            @classmethod
            def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
                if modality.startswith("image"):
                    return "<image>"

                raise ValueError("Only image modality is supported")
        ```

- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:

    ```diff
@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis

Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:

```python
import torch

from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "image_pil",
            "image_pil": image_pil
        }, {
            "type": "image_embeds",
            "image_embeds": image_embeds
        }, {
            "type": "text",
            "text": "What's in these images?"
        }],
    },
]

# Perform inference and log output.
outputs = llm.chat(conversation)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

??? Code
@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.

### Image Inputs
@ -10,7 +10,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM.

```bash
pip install bitsandbytes>=0.45.3
pip install bitsandbytes>=0.46.1
```

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
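For in-flight quantization the usual pattern is to pass `quantization="bitsandbytes"` when constructing the engine. The snippet below is a minimal sketch; the model name is only an example and the exact set of accepted arguments may differ between vLLM releases.

```python
from vllm import LLM

# In-flight BitsAndBytes quantization of an unquantized Hugging Face checkpoint.
llm = LLM(
    model="huggyllama/llama-7b",   # example model, any supported causal LM works
    quantization="bitsandbytes",
)
print(llm.generate("The capital of France is")[0].outputs[0].text)
```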
@ -201,6 +201,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
    speculative_config={
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "draft_tensor_parallel_size": 1,
        "num_speculative_tokens": 2,
    },
)
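In context, the `speculative_config` fragment above is an argument to the `LLM` constructor. A fuller sketch is shown below; the target model is an assumed pairing for this EAGLE head and is not prescribed by the diff, and flag names may vary across releases.

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed target model for this EAGLE head
    speculative_config={
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "draft_tensor_parallel_size": 1,
        "num_speculative_tokens": 2,
    },
)
outputs = llm.generate(["The future of AI is"], SamplingParams(temperature=0.0))
print(outputs[0].outputs[0].text)
```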
@ -264,6 +264,15 @@ For Qwen2.5, the chat template in tokenizer_config.json has already included sup

Flags: `--tool-call-parser hermes`

### MiniMax Models (`minimax_m1`)

Supported models:

* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax.jinja>)
* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax.jinja>)

Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax.jinja`

### DeepSeek-V3 Models (`deepseek_v3`)

Supported models:
@ -198,7 +198,12 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
```

`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
| Parameter | Description |
|----------------|-----------------------------------------------------------------------------|
| `min` | Determines the lowest value of the bucket. |
| `step` | Determines the interval between buckets. |
| `max` | Determines the upper bound of the bucket. |
| Ramp-up phase | A special handling phase applied between `min` and `step`:<br/>- `min` is multiplied by consecutive powers of two until `step` is reached.<br/>- Minimizes resource wastage for small batch sizes.<br/>- Allows larger padding for larger batches. |
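The ramp-up rule can be sketched in a few lines of Python. This is only an illustration of the behaviour described in the table, not the HPU runner's actual implementation, and the `min`/`step`/`max` values are assumed.

```python
def bucket_values(minimum: int, step: int, maximum: int) -> list[int]:
    """Illustrative bucketing: powers-of-two ramp-up from `minimum`, then fixed `step`s."""
    values, current = [], minimum
    while current < step and current <= maximum:  # ramp-up phase
        values.append(current)
        current *= 2
    values.extend(range(max(step, minimum), maximum + 1, step))  # regular phase
    return values

# Plausible settings behind the log above: 3 batch sizes x 16 block counts = 48 buckets.
assert bucket_values(1, 4, 4) == [1, 2, 4]
assert len(bucket_values(128, 128, 2048)) == 16
```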

Example (with ramp-up):

@ -357,20 +362,20 @@ Each described step is logged by vLLM server, as follows (negative values corres

* Default values:

  - Prompt:
    - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
    - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
    - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
    - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
    - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
    - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
  - Decode:
    - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
    - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
    - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
    - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
    - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
    - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
| `{phase}` | Parameter | Env Variable | Value Expression |
|-----------|-----------|--------------|------------------|
| Prompt | Batch size min | `VLLM_PROMPT_BS_BUCKET_MIN` | `1` |
| Prompt | Batch size step | `VLLM_PROMPT_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
| Prompt | Batch size max | `VLLM_PROMPT_BS_BUCKET_MAX` | `min(max_num_seqs, 64)` |
| Prompt | Sequence length min | `VLLM_PROMPT_SEQ_BUCKET_MIN` | `block_size` |
| Prompt | Sequence length step | `VLLM_PROMPT_SEQ_BUCKET_STEP` | `block_size` |
| Prompt | Sequence length max | `VLLM_PROMPT_SEQ_BUCKET_MAX` | `max_model_len` |
| Decode | Batch size min | `VLLM_DECODE_BS_BUCKET_MIN` | `1` |
| Decode | Batch size step | `VLLM_DECODE_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
| Decode | Batch size max | `VLLM_DECODE_BS_BUCKET_MAX` | `max_num_seqs` |
| Decode | Sequence length min | `VLLM_DECODE_BLOCK_BUCKET_MIN` | `block_size` |
| Decode | Sequence length step | `VLLM_DECODE_BLOCK_BUCKET_STEP` | `block_size` |
| Decode | Sequence length max | `VLLM_DECODE_BLOCK_BUCKET_MAX` | `max(128, (max_num_seqs*max_model_len)/block_size)` |

Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
@ -470,26 +470,36 @@ Specified using `--task classify`.
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |

If your model is not in the above list, we will try to automatically convert the model using
[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.

#### Sentence Pair Scoring

Specified using `--task score`.

| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |

!!! note
    Load the official original `mxbai-rerank-v2` by using the following command.

    ```bash
    vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
    ```

!!! note
    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.

    ```bash
    vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
    ```
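Any of the cross-encoders above can also be exercised offline through the score task. The following is a minimal sketch; the query and passages are made up, and the output field names are assumed from the pooling API and may differ between releases.

```python
from vllm import LLM

# Score a query against candidate passages with a cross-encoder.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["Paris is the capital of France.", "The Nile flows through Egypt."],
)
for output in outputs:
    print(output.outputs.score)
```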

[](){ #supported-mm-models }

## List of Multimodal Language Models
@ -616,9 +626,6 @@ Specified using `--task generate`.
!!! note
    Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.

!!! note
    `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.

!!! note
    To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
@ -661,11 +668,8 @@ Specified using `--task generate`.
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

!!! note
    To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
    `pip install git+https://github.com/huggingface/transformers.git`.

    Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
    `--mm-processor-kwargs '{"use_audio_in_video": true}'`.
    For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
    is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.

#### Transcription
@ -426,7 +426,7 @@ Code example: <gh-file:examples/online_serving/openai_pooling_client.py>

Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach).

We automatically wrap any other transformer via `as_classification_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.

Code example: <gh-file:examples/online_serving/openai_classification_client.py>
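The same wrapping can be checked offline before standing up the server. Below is a minimal sketch using one of the models named above; the output field names are assumed from the pooling API and may differ between releases.

```python
from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
(output,) = llm.classify(["vLLM makes serving language models straightforward."])
probs = output.outputs.probs  # per-class probabilities after the softmax described above
print(len(probs), probs)
```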
@ -6,6 +6,7 @@ import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset

# This script is an offline demo for running Mistral-Small-3.1
#
@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}},
                {
                    "type": "image_pil",
                    "image_pil": ImageAsset("cherry_blossom").pil_image,
                },
            ],
        },
    ]
@ -4,7 +4,7 @@ This script is used to profile the TPU performance of vLLM for specific prefill

Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes.

We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html).
We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [Google TPU installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/google_tpu.html).

> In all examples below, we run several warmups before (so `--enforce-eager` is okay)

@ -57,7 +57,10 @@ Once you have collected your profiles with this script, you can visualize them u
Here are most likely the dependencies you need to install:

```bash
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
pip install tensorflow-cpu \
  tensorboard-plugin-profile \
  etils \
  importlib_resources
```

Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from transformers import AutoTokenizer

@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        model="Salesforce/blip2-opt-2.7b",
        limit_mm_per_prompt={modality: 1},
    )

@ -677,6 +677,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
        ignore_patterns=["consolidated.safetensors"],
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@ -970,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    )


# Qwen
# Qwen-VL
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@ -505,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)
@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import socket

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional


@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
@ -13,13 +13,15 @@ vllm serve Qwen/Qwen2.5-3B-Instruct
To serve a reasoning model, you can use the following command:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
  --reasoning-parser deepseek_r1
```

If you want to run this script standalone with `uv`, you can use the following:

```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
  structured-output
```

See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
@ -44,7 +46,9 @@ uv run structured_outputs.py --stream
Run certain constraints, for example `structural_tag` and `regex`, streaming:

```bash
uv run structured_outputs.py --constraint structural_tag regex --stream
uv run structured_outputs.py \
  --constraint structural_tag regex \
  --stream
```

Run all constraints, with reasoning models and streaming:
@ -202,7 +202,7 @@ def parse_args():



def deserialize():
def deserialize(args, tensorizer_config):
    if args.lora_path:
        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
    llm = LLM(model=args.model,
@ -242,7 +242,7 @@ def deserialize():
    return llm


if __name__ == '__main__':
def main():
    args = parse_args()

    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
@ -260,8 +260,6 @@ if __name__ == '__main__':

    model_ref = args.model

    model_name = model_ref.split("/")[1]

    if args.command == "serialize" or args.command == "deserialize":
        keyfile = args.keyfile
    else:
@ -309,6 +307,10 @@ if __name__ == '__main__':
            encryption_keyfile = keyfile,
            **credentials
        )
        deserialize()
        deserialize(args, tensorizer_config)
    else:
        raise ValueError("Either serialize or deserialize must be specified.")


if __name__ == "__main__":
    main()
91 examples/tool_chat_template_minimax_m1.jinja Normal file
@ -0,0 +1,91 @@
{{ '<begin_of_document>' -}}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- Extract system message #}
{% set ns = namespace(system_prompt='') -%}
{%- if messages[0]['role'] == 'system' %}
    {%- if messages[0]['content'] is string %}
        {%- set ns.system_prompt = messages[0]['content']|trim %}
    {%- else %}
        {%- set ns.system_prompt = messages[0]['content'][0]['text']|trim %}
    {%- endif %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- if tools is not none %}
        {%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
    {%- else %}
        {%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
    {%- endif %}
{%- endif %}

{#- System message #}
{%- if ns.system_prompt != '' %}
    {{ '<beginning_of_sentence>system ai_setting=assistant\n' + ns.system_prompt + '<end_of_sentence>\n' -}}
{%- endif %}

{#- Tools configuration #}
{%- if tools is not none %}
    {{ '<beginning_of_sentence>system tool_setting=tools\nYou are provided with these tools:\n<tools>\n' -}}
    {%- for tool in tools %}
        {{ tool | tojson ~ '\n' -}}
    {%- endfor %}
    {{ '</tools>\n\nIf you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and json-object of arguments, following the format below:\n<tool_calls>\n{"name": <tool-name>, "arguments": <args-json-object>}\n...\n</tool_calls><end_of_sentence>\n' -}}
{%- endif %}

{#- Process messages #}
{%- for message in messages %}
    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
        {%- if message['role'] == 'user' %}
            {{ '<beginning_of_sentence>user name=user\n' -}}
            {%- if message['content'] is string %}
                {{ message['content']|trim -}}
            {%- else %}
                {%- for content in message['content'] %}
                    {%- if content['type'] == 'text' %}
                        {{ content['text']|trim -}}
                    {%- endif %}
                {%- endfor %}
            {%- endif %}
            {{ '<end_of_sentence>\n' -}}
        {%- elif message['role'] == 'assistant' %}
            {{ '<beginning_of_sentence>ai name=assistant\n' -}}
            {%- if message['content'] is string %}
                {{ message['content']|trim -}}
            {%- else %}
                {%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}
                    {{ content['text']|trim -}}
                {%- endfor %}
            {%- endif %}
            {{ '<end_of_sentence>\n' -}}
        {%- endif %}
    {%- elif 'tool_calls' in message %}
        {{ '<beginning_of_sentence>ai name=assistant\n<tool_calls>\n' -}}
        {%- for tool_call in message.tool_calls %}
            {{ '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson + '}\n' -}}
        {%- endfor %}
        {{ '</tool_calls><end_of_sentence>\n' -}}
    {%- elif message.role == "tool" or message.role == "ipython" %}
        {{ '<beginning_of_sentence>tool name=tools\n' -}}
        {%- if message.content is string %}
            {{ 'tool result: ' + message.content + '\n\n' -}}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'text' %}
                    {{ 'tool result: ' + content['text'] + '\n\n' -}}
                {%- elif content.get('name') %}
                    {{ 'tool name: ' + content['name'] + '\ntool result: ' + content['text'] + '\n\n' -}}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{ '<end_of_sentence>\n' -}}
    {%- endif %}
{%- endfor %}

{%- if add_generation_prompt %}
    {{ '<beginning_of_sentence>ai name=assistant\n' -}}
{%- endif %}
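To eyeball what this template renders before wiring it into `--chat-template`, it can be applied directly with a tokenizer. The sketch below is illustrative only: the tokenizer name follows the model list earlier in this diff, and the tool schema is a placeholder.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("MiniMaxAi/MiniMax-M1-40k", trust_remote_code=True)
template = open("examples/tool_chat_template_minimax_m1.jinja").read()

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # placeholder tool
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
messages = [{"role": "user", "content": "What's the weather in Paris?"}]

prompt = tokenizer.apply_chat_template(
    messages, tools=tools, chat_template=template,
    tokenize=False, add_generation_prompt=True)
print(prompt)
```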
@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
openai >= 1.52.0, <= 1.90.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
pydantic >= 2.10
prometheus_client >= 0.18.0
pillow # Required for image processing

1 requirements/kv_connectors.txt Normal file
@ -0,0 +1 @@
lmcache
@ -34,7 +34,7 @@ tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.45.3
bitsandbytes>=0.46.1
buildkite-test-collector==0.1.9


@ -39,7 +39,7 @@ tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.45.3
bitsandbytes==0.46.1
buildkite-test-collector==0.1.9


@ -45,7 +45,7 @@ backoff==2.2.1
    # via
    #   -r requirements/test.in
    #   schemathesis
bitsandbytes==0.45.3
bitsandbytes==0.46.1
    # via -r requirements/test.in
black==24.10.0
    # via datamodel-code-generator
@ -33,7 +33,6 @@ class RequestOutput:
class MockModelConfig:
    use_async_output_proc = True
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)


class MockEngine:

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import pytest

57 tests/config/test_mp_reducer.py Normal file
@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from unittest.mock import patch

from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM


def test_mp_reducer(monkeypatch):
    """
    Test that _reduce_config reducer is registered when AsyncLLM is instantiated
    without transformers_modules. This is a regression test for
    https://github.com/vllm-project/vllm/pull/18640.
    """

    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
    monkeypatch.setenv('VLLM_USE_V1', '1')

    # Ensure transformers_modules is not in sys.modules
    if 'transformers_modules' in sys.modules:
        del sys.modules['transformers_modules']

    with patch('multiprocessing.reducer.register') as mock_register:
        engine_args = AsyncEngineArgs(
            model="facebook/opt-125m",
            max_model_len=32,
            gpu_memory_utilization=0.1,
            disable_log_stats=True,
            disable_log_requests=True,
        )

        async_llm = AsyncLLM.from_engine_args(
            engine_args,
            start_engine_loop=False,
        )

        assert mock_register.called, (
            "multiprocessing.reducer.register should have been called")

        vllm_config_registered = False
        for call_args in mock_register.call_args_list:
            # Verify that a reducer for VllmConfig was registered
            if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
                vllm_config_registered = True

                reducer_func = call_args[0][1]
                assert callable(
                    reducer_func), "Reducer function should be callable"
                break

        assert vllm_config_registered, (
            "VllmConfig should have been registered to multiprocessing.reducer"
        )

        async_llm.shutdown()
@ -263,26 +263,6 @@ def test_media_io_kwargs_parser(arg, expected):
    assert args.media_io_kwargs == expected


@pytest.mark.parametrize(("arg", "expected"), [
    (None, dict()),
    ('{"video":"<|video_placeholder|>"}', {
        "video": "<|video_placeholder|>"
    }),
    ('{"video":"<|video_placeholder|>", "image": "<|image_placeholder|>"}', {
        "video": "<|video_placeholder|>",
        "image": "<|image_placeholder|>"
    }),
])
def test_mm_placeholder_str_override_parser(arg, expected):
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    if arg is None:
        args = parser.parse_args([])
    else:
        args = parser.parse_args(["--mm-placeholder-str-override", arg])

    assert args.mm_placeholder_str_override == expected


def test_compilation_config():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str,
    ctx = (nullcontext() if enable_prompt_embeds else pytest.raises(
        ValueError, match="set `--enable-prompt-embeds`"))

    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.
    llm = LLM(
        model=model,
        enable_prompt_embeds=enable_prompt_embeds,
@ -6,19 +6,16 @@ import pytest

# yapf conflicts with isort for this block
# yapf: disable
from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
                                                      MTEB_RERANK_TASKS,
                                                      MTEB_RERANK_TOL,
                                                      RerankClientMtebEncoder,
                                                      ScoreClientMtebEncoder,
                                                      run_mteb_rerank)
from tests.models.language.pooling.mteb_utils import (
    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
    RerankClientMtebEncoder, ScoreClientMtebEncoder,
    mteb_test_rerank_models_hf, run_mteb_rerank)
# yapf: enable
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
MAIN_SCORE = 0.33437


@pytest.fixture(scope="module")
@ -31,12 +28,19 @@ def server():
        yield remote_server


def test_mteb_score(server):
@pytest.fixture(scope="module")
def st_main_score(hf_runner):
    # The main score related to the version of the dependency.
    # So we need to recalculate every time.
    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
    return main_score


def test_mteb_score(server, st_main_score):
    url = server.url_for("score")
    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                      MTEB_RERANK_LANGS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
@ -45,12 +49,11 @@ def test_mteb_score(server):
    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)


def test_mteb_rerank(server):
def test_mteb_rerank(server, st_main_score):
    url = server.url_for("rerank")
    encoder = RerankClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                      MTEB_RERANK_LANGS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
@ -41,7 +41,6 @@ class MockModelConfig:
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
prompt = "THE FIRST WORDS I SPOKE"
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert "Mary had a little lamb," in out
|
||||
# This should "force" whisper to continue prompt in all caps
|
||||
transcription_wprompt = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0)
|
||||
out_capital = json.loads(transcription_wprompt)['text']
|
||||
assert prompt not in out_capital
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -238,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
|
||||
extra_body=dict(seed=42))
|
||||
|
||||
assert greedy_transcription.text != transcription.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_audio_prompt(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
prompt = "This is a speech, recorded in a phonograph."
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
# Prompts should not cause any part of the original transcription to be omitted.
|
||||
prefix = "The first words I spoke in the original phonograph"
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert prefix in out
|
||||
transcription_wprompt = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0)
|
||||
out_prompt = json.loads(transcription_wprompt)['text']
|
||||
assert prefix in out_prompt
|
||||
|
@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
"url": image_url
|
||||
}
|
||||
}, {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
|
@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
|
||||
start += seq_len
|
||||
# Convert xformers.AttentionBias to a Tensor for use in the reference impl.
|
||||
alibi_bias = [
|
||||
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
|
||||
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
|
||||
for b, i in zip(attn_bias, seq_lens)
|
||||
]
|
||||
else:
|
||||
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
||||
|
@ -171,7 +171,7 @@ def test_env(
|
||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
backend = get_attn_backend(32,
|
||||
torch.float16,
|
||||
torch.float16,
|
||||
block_size,
|
||||
@ -180,6 +180,45 @@ def test_env(
|
||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
|
||||
if use_v1:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
torch.float16,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_fp32_fallback(
|
||||
device: str,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with fp32."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
|
||||
if device == "cpu":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||
16, False)
|
||||
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
|
||||
if use_v1 else "TORCH_SDPA")
|
||||
|
||||
elif device == "cuda":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CudaPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||
16, False)
|
||||
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||
if use_v1 else "XFORMERS")
|
||||
|
||||
|
||||
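The new `test_fp32_fallback` above encodes the expected behaviour in user terms: with fp32 activations, V1 on CUDA falls back to FlexAttention, while CPU keeps TORCH_SDPA. A simplified, hypothetical repro of the CUDA case that mirrors the test (the import paths are assumptions based on the names the test uses; illustrative only, not part of the diff):

```python
import os
from unittest.mock import patch

import torch

from vllm.attention.selector import get_attn_backend
from vllm.platforms.cuda import CudaPlatform

os.environ["VLLM_USE_V1"] = "1"  # the V1 path asserted in the test
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
    backend = get_attn_backend(16, torch.float32, torch.float32, 16, False)
    print(backend.get_name())  # the test expects "FLEX_ATTENTION" here
```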
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test FlashAttn validation."""
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
DeepEP test utilities
|
||||
"""
|
||||
@ -137,8 +138,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
||||
low_latency_mode=low_latency_mode,
|
||||
num_qps_per_rank=num_qps_per_rank)
|
||||
return DeepEPHTPrepareAndFinalize(buffer=buffer,
|
||||
world_size=pgi.world_size,
|
||||
rank=pgi.rank,
|
||||
num_dispatchers=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
rank_expert_offset=pgi.rank *
|
||||
ht_args.num_local_experts)
|
||||
@ -146,7 +146,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
||||
|
||||
def make_deepep_ll_a2a(pg: ProcessGroup,
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
deepep_ll_args: DeepEPLLArgs,
|
||||
q_dtype: Optional[torch.dtype] = None,
|
||||
block_shape: Optional[list[int]] = None):
|
||||
@ -166,8 +165,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
|
||||
|
||||
return DeepEPLLPrepareAndFinalize(
|
||||
buffer=buffer,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
num_dispatchers=pgi.world_size,
|
||||
max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
|
||||
use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
|
||||
)
|
||||
@ -186,5 +184,4 @@ def make_deepep_a2a(pg: ProcessGroup,
|
||||
block_shape)
|
||||
|
||||
assert deepep_ll_args is not None
|
||||
return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
|
||||
block_shape)
|
||||
return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)
|
||||
|
@ -10,7 +10,7 @@ import triton.language as tl
|
||||
|
||||
from tests.kernels.moe.utils import (batched_moe,
|
||||
make_quantized_test_activations,
|
||||
make_test_weights, triton_moe)
|
||||
make_test_weights, naive_batched_moe)
|
||||
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
|
||||
from tests.kernels.utils import torch_experts
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
@ -33,12 +33,10 @@ MNK_FACTORS = [
|
||||
(45, 512, 512),
|
||||
(45, 1024, 128),
|
||||
(45, 1024, 2048),
|
||||
(64, 128, 128),
|
||||
(64, 512, 512),
|
||||
(64, 1024, 2048),
|
||||
(222, 128, 128),
|
||||
(222, 128, 2048),
|
||||
(222, 512, 512),
|
||||
(222, 1024, 128),
|
||||
(222, 1024, 2048),
|
||||
]
|
||||
@ -95,11 +93,12 @@ class BatchedMMTensors:
|
||||
@pytest.mark.parametrize("max_tokens_per_expert",
|
||||
[32, 64, 128, 192, 224, 256, 512])
|
||||
@pytest.mark.parametrize("K", [128, 256, 1024])
|
||||
@pytest.mark.parametrize("N", [128, 256, 512, 1024])
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("block_shape", [None])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
||||
@pytest.mark.parametrize("N", [128, 256, 1024])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
N: int, dtype: torch.dtype,
|
||||
block_shape: Optional[list[int]],
|
||||
@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant)
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
B, B_q, B_scale, _, _, _ = make_test_weights(
|
||||
num_experts,
|
||||
@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
out_shape = (num_experts, max_tokens_per_expert, N)
|
||||
@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
"BLOCK_SIZE_N": 16,
|
||||
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
|
||||
},
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
B,
|
||||
ref_output,
|
||||
num_expert_tokens,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
|
||||
num_expert_tokens,
|
||||
A_scale, B_scale,
|
||||
block_shape)
|
||||
block_shape,
|
||||
per_act_token_quant)
|
||||
|
||||
rtol, atol = {
|
||||
torch.float16: (6e-2, 6e-2),
|
||||
@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||
torch.float32: (1e-2, 1e-2),
|
||||
}[test_output.dtype]
|
||||
|
||||
torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
||||
@pytest.mark.parametrize("block_shape", [None])
|
||||
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("input_scales", [False])
|
||||
def test_fused_moe_batched_experts(
|
||||
m: int,
|
||||
n: int,
|
||||
@ -220,15 +221,19 @@ def test_fused_moe_batched_experts(
|
||||
dtype: torch.dtype,
|
||||
per_act_token_quant: bool,
|
||||
block_shape: Optional[list[int]],
|
||||
input_scales: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
|
||||
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
|
||||
|
||||
if topk > e:
|
||||
pytest.skip("topk > e")
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None or topk > e:
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip illegal quantization test.")
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
@ -241,27 +246,26 @@ def test_fused_moe_batched_experts(
|
||||
act_dtype = dtype
|
||||
quant_dtype = None
|
||||
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
||||
w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
block_shape=block_shape,
|
||||
in_dtype=act_dtype,
|
||||
quant_dtype=quant_dtype)
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
if input_scales and quant_dtype is not None:
|
||||
a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
batched_output = batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
baseline_output = torch_experts(
|
||||
a,
|
||||
w1,
|
||||
@ -270,11 +274,14 @@ def test_fused_moe_batched_experts(
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape)
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
triton_output = triton_moe(
|
||||
batched_output = naive_batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
@ -282,14 +289,31 @@ def test_fused_moe_batched_experts(
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(triton_output,
|
||||
triton_output = batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(batched_output,
|
||||
baseline_output,
|
||||
atol=2e-2,
|
||||
atol=3e-2,
|
||||
rtol=2e-2)
|
||||
|
||||
torch.testing.assert_close(triton_output,
|
||||
|
116
tests/kernels/moe/test_cutlass_grouped_gemm.py
Normal file
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py

import random

import pytest
import torch

from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform


def cdiv(a, b):
    return (a + b - 1) // b


def per_token_cast_to_fp8(
        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    assert x.dim() == 2
    m, n = x.shape
    pad_size = (128 - (n % 128)) % 128
    x = torch.nn.functional.pad(x,
                                (0, pad_size), value=0) if pad_size > 0 else x
    x_view = x.view(m, -1, 128)
    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
    fp8_data = (x_view *
                (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn)
    return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)


def per_block_cast_to_fp8(
        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    assert x.dim() == 2
    m, n = x.shape
    x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
                           device=x.device,
                           dtype=x.dtype)
    x_padded[:m, :n] = x
    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
    x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
        x_amax / 448.0).view(x_view.size(0), x_view.size(2))


@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
    (4, 8192, 7168, 4096),
    (4, 8192, 2048, 7168),
    (8, 4096, 7168, 4096),
    (8, 4096, 2048, 7168),
    (32, 1024, 7168, 4096),
    (32, 1024, 2048, 7168),
])
@pytest.mark.parametrize("out_dtype", [torch.float16])
@pytest.mark.skipif(
    (lambda x: x is None or x.to_int() != 100)(
        current_platform.get_device_capability()),
    reason="Block Scaled Grouped GEMM is only supported on SM100.")
def test_cutlass_grouped_gemm(
    num_groups: int,
    expected_m_per_group: int,
    k: int,
    n: int,
    out_dtype: torch.dtype,
):
    device = "cuda"
    alignment = 128
    group_ms = [
        int(expected_m_per_group * random.uniform(0.7, 1.3))
        for _ in range(num_groups)
    ]
    m = sum([cdiv(m, alignment) * alignment for m in group_ms])

    x = torch.randn((m, k), device=device, dtype=out_dtype)
    y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
    out = torch.empty((m, n), device=device, dtype=out_dtype)
    ref_out = torch.randn((m, n), device=device, dtype=out_dtype)

    ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
    pb_size = []
    for i in range(num_groups):
        pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
    problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
    expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)

    x_fp8 = per_token_cast_to_fp8(x)
    y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
             torch.empty((num_groups, cdiv(n, 128), k // 128),
                         device=device,
                         dtype=torch.float))
    for i in range(num_groups):
        y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])

    for i in range(num_groups):
        a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
        a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
        b = y_fp8[0][i].t()
        b_scale = y_fp8[1][i].t()
        baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
        ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline

    ops.cutlass_blockwise_scaled_grouped_mm(
        out,
        x_fp8[0],
        y_fp8[0],
        x_fp8[1],
        y_fp8[1],
        problem_sizes,
        expert_offsets[:-1],
    )

    torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
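The helpers above quantize to `torch.float8_e4m3fn`, whose largest finite value is 448, which is why each 128-wide group is rescaled by `448.0 / amax` and the inverse scale is returned alongside the data. A minimal round-trip sketch (illustrative only, not part of the diff; it assumes a CUDA device with FP8 support and reuses `per_token_cast_to_fp8` from the new test file):

```python
import torch

x = torch.randn(4, 256, device="cuda", dtype=torch.float16)
x_fp8, x_scale = per_token_cast_to_fp8(x)  # x_scale: one float32 scale per 128-wide group
# Undo the quantization by broadcasting each group's scale back over its 128 columns.
x_dq = (x_fp8.float().view(4, -1, 128) * x_scale.unsqueeze(2)).view(4, 256)
print((x.float() - x_dq).abs().max())  # small residual, bounded by FP8 rounding error
```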
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test DeepEP + DeepGEMM integration
|
||||
DeepGEMM are gemm kernels specialized for the
|
||||
@ -148,8 +149,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
|
||||
|
||||
fused_experts = BatchedDeepGemmExperts(
|
||||
max_num_tokens=max_tokens_per_rank,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
num_dispatchers=pgi.world_size // dp_size,
|
||||
block_shape=test_config.block_size,
|
||||
per_act_token_quant=test_config.per_act_token_quant)
|
||||
mk = FusedMoEModularKernel(prepare_finalize=a2a,
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test deepep dispatch-combine logic
|
||||
"""
|
||||
@ -154,12 +155,13 @@ def make_modular_kernel(
|
||||
deepep_ht_args = ht_args,
|
||||
deepep_ll_args = ll_args)
|
||||
|
||||
num_dispatchers = pgi.world_size // dp_size
|
||||
|
||||
if low_latency_mode:
|
||||
assert not per_act_token_quant, "not supported in ll mode"
|
||||
fused_experts = BatchedTritonExperts(
|
||||
max_num_tokens=MAX_TOKENS_PER_RANK,
|
||||
world_size=pgi.world_size,
|
||||
dp_size=dp_size,
|
||||
num_dispatchers=num_dispatchers,
|
||||
use_fp8_w8a8=is_quantized,
|
||||
use_int8_w8a8=False,
|
||||
use_int8_w8a16=False,
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Unit-test DeepGEMM FP8 kernels (no DeepEP).
|
||||
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.
|
||||
|
@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv
|
||||
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
@ -112,18 +113,21 @@ def pplx_cutlass_moe(
|
||||
w2_scale = w2_scale.to(device)
|
||||
a1_scale = a1_scale.to(device)
|
||||
|
||||
assert num_experts % world_size == 0
|
||||
num_local_experts = cdiv(num_experts, world_size)
|
||||
num_dispatchers = pgi.world_size // dp_size
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens,
|
||||
pgi.world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
)
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_local_experts=num_local_experts,
|
||||
num_dispatchers=num_dispatchers)
|
||||
|
||||
experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
|
||||
experts = CutlassExpertsFp8(num_local_experts,
|
||||
out_dtype,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
num_dispatchers=num_dispatchers,
|
||||
use_batched_format=True)
|
||||
|
||||
fused_cutlass_experts = FusedMoEModularKernel(
|
||||
@ -181,6 +185,7 @@ def _pplx_moe(
|
||||
per_out_ch: bool,
|
||||
use_internode: bool,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
@ -188,12 +193,13 @@ def _pplx_moe(
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
|
||||
topk_ids)
|
||||
torch_output = torch_experts(a_full, w1_full, w2_full,
|
||||
topk_weights, topk_ids)
|
||||
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
|
||||
w2_scale, topk_weights, topk_ids,
|
||||
a1_scale, out_dtype, per_act_token,
|
||||
@ -206,8 +212,11 @@ def _pplx_moe(
|
||||
# print("PPLX OUT:", pplx_output)
|
||||
# print("TORCH OUT:", torch_output)
|
||||
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
torch_output,
|
||||
atol=0.05,
|
||||
rtol=0)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
@ -4,7 +4,10 @@
|
||||
|
||||
Run `pytest tests/kernels/test_pplx_moe.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
import itertools
|
||||
import textwrap
|
||||
import traceback
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -19,12 +22,13 @@ except ImportError:
|
||||
has_pplx = False
|
||||
|
||||
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
|
||||
from tests.kernels.quant_utils import dequant
|
||||
from tests.kernels.utils import torch_experts
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
|
||||
BatchedTritonExperts)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
@ -38,22 +42,22 @@ requires_pplx = pytest.mark.skipif(
|
||||
reason="Requires PPLX kernels",
|
||||
)
|
||||
|
||||
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
|
||||
(222, 2048, 1024)]
|
||||
|
||||
PPLX_MOE_COMBOS = [
|
||||
(1, 128, 128),
|
||||
PPLX_COMBOS = [
|
||||
# TODO: figure out why this fails, seems to be test problem
|
||||
#(1, 128, 128),
|
||||
(2, 128, 512),
|
||||
(3, 1024, 2048),
|
||||
(32, 128, 1024),
|
||||
(4, 128, 128),
|
||||
(32, 1024, 512),
|
||||
(45, 512, 2048),
|
||||
(64, 1024, 1024),
|
||||
(222, 1024, 2048),
|
||||
(64, 1024, 512),
|
||||
(222, 2048, 1024),
|
||||
(256, 1408, 2048),
|
||||
]
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
EP_SIZE = [1, 4]
|
||||
TOP_KS = [1, 2, 6]
|
||||
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
@ -169,9 +173,11 @@ def test_fused_moe_batched_experts(
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
|
||||
baseline_output = torch_experts(a, w1, w2, topk_weight,
|
||||
topk_ids) # only for baseline
|
||||
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
||||
batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
||||
batched_output = naive_batched_moe(
|
||||
a, w1, w2, topk_weight, topk_ids) # pick torch_experts or this
|
||||
|
||||
torch.testing.assert_close(baseline_output,
|
||||
torch_output,
|
||||
@ -183,6 +189,63 @@ def test_fused_moe_batched_experts(
|
||||
rtol=0)
|
||||
|
||||
|
||||
def create_pplx_prepare_finalize(
|
||||
num_tokens: int,
|
||||
hidden_dim: int,
|
||||
topk: int,
|
||||
num_experts: int,
|
||||
rank: int,
|
||||
dp_size: int,
|
||||
world_size: int,
|
||||
in_dtype: torch.dtype,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
group_name: Optional[str],
|
||||
):
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
||||
|
||||
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
|
||||
num_local_experts = rank_chunk(num_experts, 0, world_size)
|
||||
|
||||
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||
max_num_tokens,
|
||||
hidden_dim,
|
||||
in_dtype,
|
||||
quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim_bytes,
|
||||
hidden_dim_scale_bytes=scale_bytes,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_local_experts=num_local_experts,
|
||||
num_dispatchers=world_size // dp_size,
|
||||
)
|
||||
|
||||
return prepare_finalize, ata
|
||||
|
||||
|
||||
def rank_chunk(num: int, r: int, w: int) -> int:
    rem = num % w
    return (num // w) + (1 if r < rem else 0)

@@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
    return t[(r * chunk):(r + 1) * chunk]


def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
                        w: int) -> Optional[torch.Tensor]:
    if t is not None:
        return chunk_by_rank(t, r, w)
    else:
        return t


def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
                         w: int) -> Optional[torch.Tensor]:
    if t is not None and t.numel() > 1:
        chunk = rank_chunk(t.shape[0], r, w)
        return t[(r * chunk):(r + 1) * chunk]
    else:
        return t


def chunk_scales(t: Optional[torch.Tensor], start: int,
                 end: int) -> Optional[torch.Tensor]:
    if t is not None and t.numel() > 1:
        return t[start:end]
    else:
        return t


def dummy_work(a: torch.Tensor) -> torch.Tensor:
    return a * 1.1
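A quick illustration of the chunking convention used by these helpers (not part of the diff): `rank_chunk` spreads any remainder over the lowest-numbered ranks, and the `*_by_rank` variants slice their tensor with that per-rank size.

```python
# 10 rows split across 4 ranks: ranks 0 and 1 each take one extra row.
sizes = [rank_chunk(10, r, 4) for r in range(4)]
assert sizes == [3, 3, 2, 2] and sum(sizes) == 10
```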
def pplx_prepare_finalize(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
@ -200,11 +292,11 @@ def pplx_prepare_finalize(
|
||||
topk_weight: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
num_experts: int,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
group_name: Optional[str],
|
||||
) -> torch.Tensor:
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize)
|
||||
|
||||
assert torch.cuda.current_device() == pgi.local_rank
|
||||
|
||||
topk = topk_ids.shape[1]
|
||||
@ -212,60 +304,66 @@ def pplx_prepare_finalize(
|
||||
device = pgi.device
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
|
||||
hidden_dim_scale_bytes=0,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens,
|
||||
world_size,
|
||||
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||
num_tokens,
|
||||
hidden_dim,
|
||||
topk,
|
||||
num_experts,
|
||||
rank,
|
||||
dp_size,
|
||||
world_size,
|
||||
a.dtype,
|
||||
quant_dtype,
|
||||
block_shape,
|
||||
per_act_token_quant,
|
||||
group_name,
|
||||
)
|
||||
|
||||
assert a.shape[0] == topk_ids.shape[0]
|
||||
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
|
||||
assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
|
||||
|
||||
out = torch.full(
|
||||
a_chunk.shape,
|
||||
torch.nan,
|
||||
dtype=a.dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
if (quant_dtype is not None and not per_act_token_quant
|
||||
and block_shape is None):
|
||||
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
|
||||
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
|
||||
a_chunk,
|
||||
None,
|
||||
None,
|
||||
a1_scale,
|
||||
a2_scale,
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
num_experts,
|
||||
None,
|
||||
False,
|
||||
FusedMoEQuantConfig(),
|
||||
FusedMoEQuantConfig(
|
||||
quant_dtype,
|
||||
per_act_token_quant,
|
||||
False,
|
||||
block_shape,
|
||||
),
|
||||
)
|
||||
|
||||
b_a = b_a * 1.5
|
||||
|
||||
out = torch.full(
|
||||
(max_num_tokens, hidden_dim),
|
||||
torch.nan,
|
||||
dtype=a.dtype,
|
||||
device=device,
|
||||
)
|
||||
b_a = dummy_work(
|
||||
dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
|
||||
|
||||
prepare_finalize.finalize(
|
||||
out,
|
||||
@ -291,8 +389,12 @@ def _pplx_prepare_finalize(
|
||||
score: torch.Tensor,
|
||||
topk: torch.Tensor,
|
||||
num_experts: int,
|
||||
quant_dtype: Optional[torch.dtype],
|
||||
block_shape: Optional[list[int]],
|
||||
per_act_token_quant: bool,
|
||||
use_internode: bool,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
@ -301,60 +403,82 @@ def _pplx_prepare_finalize(
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
device = pgi.device
|
||||
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
k = a.shape[1]
|
||||
m, k = a.shape
|
||||
|
||||
a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
|
||||
a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
|
||||
|
||||
torch_output = (a_rep.view(-1, topk, k) * 1.5 *
|
||||
topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
|
||||
a.dtype)
|
||||
torch_output = (a_rep.view(m, topk, k) *
|
||||
topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
|
||||
dim=1)
|
||||
|
||||
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
|
||||
num_experts, group_name)
|
||||
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
|
||||
topk_ids, num_experts, quant_dtype,
|
||||
block_shape, per_act_token_quant,
|
||||
group_name)
|
||||
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
||||
pgi.world_size).to(pgi.device)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
torch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
# TODO (bnell): this test point does not work for odd M due to how the test is
|
||||
# written, not due to limitations of the pplx kernels. The pplx_moe
|
||||
# test below is able to deal with odd M.
|
||||
# TODO (bnell) add fp8 tests
|
||||
@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
|
||||
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
def test_pplx_prepare_finalize(
|
||||
def test_pplx_prepare_finalize_slow(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
world_dp_size: tuple[int, int],
|
||||
per_act_token_quant: bool,
|
||||
block_shape: Optional[list[int]],
|
||||
use_internode: bool,
|
||||
):
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
use_fp8_w8a8 = True
|
||||
act_dtype = torch.bfloat16
|
||||
quant_dtype = dtype
|
||||
else:
|
||||
use_fp8_w8a8 = False
|
||||
act_dtype = dtype
|
||||
quant_dtype = None
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip illegal quantization combination")
|
||||
|
||||
current_platform.seed_everything(7)
|
||||
m, n, k = mnk
|
||||
world_size, dp_size = world_dp_size
|
||||
device = "cuda"
|
||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
||||
|
||||
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
|
||||
score = torch.randn((m, e), device=device, dtype=act_dtype)
|
||||
|
||||
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
|
||||
topk, e, use_internode)
|
||||
topk, e, quant_dtype, block_shape, per_act_token_quant,
|
||||
use_internode)
|
||||
|
||||
|
||||
def pplx_moe(
|
||||
@ -369,84 +493,62 @@ def pplx_moe(
|
||||
topk_ids: torch.Tensor,
|
||||
w1_scale: Optional[torch.Tensor] = None,
|
||||
w2_scale: Optional[torch.Tensor] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
a1_scale: Optional[torch.Tensor] = None,
|
||||
a2_scale: Optional[torch.Tensor] = None,
|
||||
quant_dtype: Optional[torch.dtype] = None,
|
||||
per_act_token_quant=False,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
use_compile: bool = False,
|
||||
use_cudagraphs: bool = True,
|
||||
) -> torch.Tensor:
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
||||
|
||||
device = torch.device("cuda", rank)
|
||||
hidden_dim = a.shape[1]
|
||||
num_tokens, hidden_dim = a.shape
|
||||
num_experts = w1.shape[0]
|
||||
topk = topk_ids.shape[1]
|
||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
|
||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
|
||||
|
||||
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||
max_num_tokens,
|
||||
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||
num_tokens,
|
||||
hidden_dim,
|
||||
topk,
|
||||
num_experts,
|
||||
rank,
|
||||
dp_size,
|
||||
world_size,
|
||||
a.dtype,
|
||||
qtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
quant_dtype,
|
||||
block_shape,
|
||||
per_act_token_quant,
|
||||
group_name,
|
||||
)
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim_bytes,
|
||||
hidden_dim_scale_bytes=scale_bytes,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens,
|
||||
world_size,
|
||||
rank,
|
||||
dp_size,
|
||||
experts = BatchedTritonExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_dispatchers=prepare_finalize.num_dispatchers(),
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
use_fp8_w8a8=qtype == torch.float8_e4m3fn,
|
||||
block_shape=block_shape)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
experts,
|
||||
)
|
||||
|
||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
a_chunk = chunk_by_rank(a, rank, world_size)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
|
||||
|
||||
# Chunking weights like this only works for batched format
|
||||
w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
|
||||
w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
|
||||
|
||||
if w1_scale is not None:
|
||||
w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
|
||||
w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
|
||||
else:
|
||||
w1_scale_chunk = None
|
||||
w2_scale_chunk = None
|
||||
w1_chunk = chunk_by_rank(w1, rank, world_size)
|
||||
w2_chunk = chunk_by_rank(w2, rank, world_size)
|
||||
w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
|
||||
w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
|
||||
a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
|
||||
a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
|
||||
|
||||
# Note: for now use_compile will error out if the problem size is
|
||||
# large enough to trigger chunking. I'm leaving the flag and
|
||||
@ -468,6 +570,8 @@ def pplx_moe(
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
a1_scale=a1_scale_chunk,
|
||||
a2_scale=a2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
if use_cudagraphs:
|
||||
@ -482,6 +586,8 @@ def pplx_moe(
|
||||
chunk_topk_ids,
|
||||
w1_scale=w1_scale_chunk,
|
||||
w2_scale=w2_scale_chunk,
|
||||
a1_scale=a1_scale_chunk,
|
||||
a2_scale=a2_scale_chunk,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
@ -494,48 +600,6 @@ def pplx_moe(
|
||||
return out
|
||||
|
||||
|
||||
def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
|
||||
assert torch.cuda.current_device() == pgi.local_rank
|
||||
|
||||
num_experts = w1.shape[0]
|
||||
device = pgi.device
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
|
||||
|
||||
prepare_finalize = BatchedPrepareAndFinalize(
|
||||
max_num_tokens=max_num_tokens,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
rank=rank,
|
||||
)
|
||||
|
||||
experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
|
||||
world_size=1,
|
||||
dp_size=1)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
experts,
|
||||
)
|
||||
|
||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||
|
||||
out = fused_experts(
|
||||
a_chunk,
|
||||
# Chunking weights like this only works for batched format
|
||||
chunk_by_rank(w1, rank, world_size).to(device),
|
||||
chunk_by_rank(w2, rank, world_size).to(device),
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
global_num_experts=num_experts)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _pplx_moe(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
@ -544,13 +608,15 @@ def _pplx_moe(
|
||||
w2: torch.Tensor,
|
||||
score: torch.Tensor,
|
||||
topk: int,
|
||||
num_experts: int,
|
||||
w1_s: Optional[torch.Tensor] = None,
|
||||
w2_s: Optional[torch.Tensor] = None,
|
||||
qtype: Optional[torch.dtype] = None,
|
||||
quant_dtype: Optional[torch.dtype] = None,
|
||||
per_act_token_quant: bool = False,
|
||||
block_shape: Optional[list[int]] = None,
|
||||
use_internode: bool = False,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = nvshmem_get_unique_id(
|
||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||
@ -559,7 +625,8 @@ def _pplx_moe(
|
||||
group_name = None
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
cpu_group = torch.distributed.new_group(group_ranks,
|
||||
backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
m, k = a.shape
|
||||
@ -568,51 +635,103 @@ def _pplx_moe(
|
||||
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
||||
|
||||
device = torch.device("cuda", pgi.rank)
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
|
||||
a = a.to(device)
|
||||
w1 = w1.to(device)
|
||||
w2 = w2.to(device)
|
||||
w1_s = w1_s.to(device) if w1_s is not None else None
|
||||
w2_s = w2_s.to(device) if w2_s is not None else None
|
||||
|
||||
if (quant_dtype is not None and not per_act_token_quant
|
||||
and block_shape is None):
|
||||
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
else:
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
|
||||
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||
torch_output = torch_experts(a,
|
||||
|
||||
torch_output = torch_experts(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
quant_dtype=qtype,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape)
|
||||
pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
|
||||
a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
|
||||
qtype, per_act_token_quant, block_shape)
|
||||
# TODO (bnell): fix + re-enable
|
||||
#batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
|
||||
# topk_ids)
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||
pgi.world_size).to(pplx_output.device)
|
||||
batched_output = naive_batched_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
||||
#torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
|
||||
pplx_output = pplx_moe(
|
||||
group_name,
|
||||
rank,
|
||||
world_size,
|
||||
dp_size,
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weight,
|
||||
topk_ids,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
chunked_batch_output = chunk_by_rank(
|
||||
batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
|
||||
|
||||
torch.testing.assert_close(batched_output,
|
||||
torch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
|
||||
torch.testing.assert_close(pplx_output,
|
||||
chunked_batch_output,
|
||||
atol=3e-2,
|
||||
rtol=3e-2)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
|
||||
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
def test_pplx_moe(
|
||||
def test_pplx_moe_slow(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
topk: int,
|
||||
@ -633,18 +752,143 @@ def test_pplx_moe(
|
||||
use_fp8_w8a8 = False
|
||||
quant_dtype = None
|
||||
|
||||
if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
|
||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||
pytest.skip("Skip quantization test for non-quantized type")
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
pytest.skip("Skip illegal quantization combination")
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape)
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
|
||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
|
||||
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
|
||||
use_internode)
|
||||
|
||||
|
||||
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
|
||||
make_weights: bool, test_fn: Callable):
|
||||
|
||||
def format_result(msg, ex=None):
|
||||
if ex is not None:
|
||||
x = str(ex)
|
||||
newx = x.strip(" \n\t")[:16]
|
||||
if len(newx) < len(x):
|
||||
newx = newx + " ..."
|
||||
|
||||
prefix = "E\t"
|
||||
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
|
||||
print(f"FAILED {msg} - {newx}\n")
|
||||
else:
|
||||
print(f"PASSED {msg}")
|
||||
|
||||
current_platform.seed_everything(7)
|
||||
combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
|
||||
[False, True], [None, [128, 128]])
|
||||
exceptions = []
|
||||
count = 0
|
||||
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
|
||||
count = count + 1
|
||||
m, n, k = mnk
|
||||
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
use_fp8_w8a8 = True
|
||||
quant_dtype = dtype
|
||||
else:
|
||||
use_fp8_w8a8 = False
|
||||
quant_dtype = None
|
||||
|
||||
test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
|
||||
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
|
||||
f"block_shape={block_shape}")
|
||||
|
||||
if not use_fp8_w8a8 and (per_act_token_quant
|
||||
or block_shape is not None):
|
||||
print(
|
||||
f"{test_desc} - Skip quantization test for non-quantized type."
|
||||
)
|
||||
continue
|
||||
|
||||
if per_act_token_quant and block_shape is not None:
|
||||
print(f"{test_desc} - Skip illegal quantization combination.")
|
||||
continue
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
args = dict()
|
||||
if make_weights:
|
||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||
e,
|
||||
n,
|
||||
k,
|
||||
quant_dtype=quant_dtype,
|
||||
block_shape=block_shape,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
)
|
||||
args["w1"] = w1
|
||||
args["w2"] = w2
|
||||
args["w1_s"] = w1_s
|
||||
args["w2_s"] = w2_s
|
||||
|
||||
try:
|
||||
test_fn(
|
||||
pgi=pgi,
|
||||
dp_size=dp_size,
|
||||
a=a,
|
||||
score=score,
|
||||
topk=topk,
|
||||
num_experts=e,
|
||||
quant_dtype=quant_dtype,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
use_internode=use_internode,
|
||||
**args,
|
||||
)
|
||||
format_result(test_desc)
|
||||
except Exception as ex:
|
||||
format_result(test_desc, ex)
|
||||
exceptions.append(ex)
|
||||
|
||||
if len(exceptions) > 0:
|
||||
raise RuntimeError(
|
||||
f"{len(exceptions)} of {count} tests failed in child process, "
|
||||
f"rank={pgi.rank}.")
|
||||
else:
|
||||
print(f"{count} of {count} tests passed in child process, "
|
||||
f"rank={pgi.rank}.")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
def test_pplx_prepare_finalize(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
world_size, dp_size = world_dp_size
|
||||
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
|
||||
use_internode, False, _pplx_prepare_finalize)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
def test_pplx_moe(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
world_size, dp_size = world_dp_size
|
||||
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
|
||||
_pplx_moe)
|
||||
|
@ -63,13 +63,12 @@ def batched_moe(
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
BatchedPrepareAndFinalize(max_num_tokens,
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
rank=0),
|
||||
BatchedTritonExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
num_dispatchers=1,
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
@ -105,13 +104,12 @@ def naive_batched_moe(
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
BatchedPrepareAndFinalize(max_num_tokens,
|
||||
world_size=1,
|
||||
dp_size=1,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
rank=0),
|
||||
NaiveBatchedExperts(
|
||||
max_num_tokens=max_num_tokens,
|
||||
dp_size=1,
|
||||
world_size=1,
|
||||
num_dispatchers=1,
|
||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||
per_act_token_quant=per_act_token_quant,
|
||||
block_shape=block_shape,
|
||||
|
@ -277,6 +277,24 @@ def dequant(
|
||||
return t.to(out_dtype)
|
||||
|
||||
|
||||
def batched_dequant(
    t: torch.Tensor,
    scale: Optional[torch.Tensor],
    block_shape: Optional[list[int]],
    per_act_token_quant: bool,
    out_dtype: Optional[torch.dtype] = torch.float32,
) -> torch.Tensor:
    if scale is not None:
        assert t.shape[0] == scale.shape[0]
        out = torch.empty_like(t, dtype=out_dtype)
        for e in range(t.shape[0]):
            out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
                             out_dtype)
        return out

    return t.to(out_dtype)
|
||||
|
||||
|
||||
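Reading aid for the new `batched_dequant` helper above (illustrative only, not from the diff): it walks the leading expert dimension and dequantizes each expert's slice independently. With a single per-expert scale, the loop is morally equivalent to this plain-PyTorch sketch (hypothetical shapes; the real helper delegates each slice to `dequant`, which also handles block and per-token scales):

```python
import torch

num_experts, max_tokens, hidden = 4, 32, 256
t = torch.randn(num_experts, max_tokens, hidden, dtype=torch.bfloat16)  # stand-in for quantized data
scale = torch.rand(num_experts, 1, 1)                                   # hypothetical per-expert scale
out = torch.empty_like(t, dtype=torch.float32)
for e in range(num_experts):  # same per-expert loop as batched_dequant
    out[e] = t[e].to(torch.float32) * scale[e]
```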
def native_batched_masked_quant_matmul(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -74,3 +75,51 @@ def test_apply_repetition_penalties(
|
||||
# Test the operator by applying the opcheck utility
|
||||
opcheck(torch.ops._C.apply_repetition_penalties_,
|
||||
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test is for checking the CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
    """
    Test the apply_repetition_penalties custom op with num_seqs=0
    against a reference implementation.
    """
    num_seqs = 0
    vocab_size = 17
    repetition_penalty = 1.05
    dtype = torch.float32
    seed = 0

    current_platform.seed_everything(seed)
    torch.set_default_device("cuda:0")

    # Create test data
    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)

    # Create masks (empty here: with num_seqs=0 there are no tokens to mark
    # as repeated)
    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)

    # Create repetition penalties tensor
    repetition_penalties = torch.full((num_seqs, ),
                                      repetition_penalty,
                                      dtype=dtype)

    # Run both implementations
    logits_torch = logits.clone()
    logits_cuda = logits.clone()

    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
                                     repetition_penalties)
    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
                                    repetition_penalties)

    # Compare the torch reference against the CUDA kernel
    torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)

    # Test the operator by applying the opcheck utility
    opcheck(torch.ops._C.apply_repetition_penalties_,
            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
||||
|
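The test above checks the CUDA kernel against a pure-PyTorch reference. As a reminder of what the usual repetition-penalty rule looks like, here is a small standalone sketch (illustrative only, not vLLM's kernel or its exact reference): positive logits of previously seen tokens are divided by the penalty, negative ones are multiplied by it.

```python
import torch


def apply_repetition_penalties_ref(logits: torch.Tensor,
                                   prompt_mask: torch.Tensor,
                                   output_mask: torch.Tensor,
                                   repetition_penalties: torch.Tensor) -> None:
    """In-place repetition penalty on [num_seqs, vocab_size] logits."""
    repeated = prompt_mask | output_mask
    penalties = repetition_penalties.unsqueeze(1).expand_as(logits)
    # Divide positive logits, multiply negative logits of repeated tokens.
    logits[:] = torch.where(repeated & (logits > 0), logits / penalties, logits)
    logits[:] = torch.where(repeated & (logits <= 0), logits * penalties, logits)


if __name__ == "__main__":
    logits = torch.randn(2, 5)
    prompt_mask = torch.zeros(2, 5, dtype=torch.bool)
    prompt_mask[0, 3] = True
    before = logits.clone()
    apply_repetition_penalties_ref(logits, prompt_mask,
                                   torch.zeros_like(prompt_mask),
                                   torch.full((2, ), 1.5))
    assert not torch.equal(before, logits)  # only the masked token moved
```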
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""

import random
@@ -1094,6 +1094,8 @@ def torch_experts(
if expert_map is not None:
topk_ids = expert_map[topk_ids]

f32 = torch.float32

for i in range(num_experts):
mask = topk_ids == i
if mask.sum():

@@ -1109,7 +1111,8 @@ def torch_experts(
out.dtype)
tmp2 = SiluAndMul()(tmp1)
tmp2, b_scale = moe_kernel_quantize_input(
tmp2, None, quant_dtype, per_act_token_quant, block_shape)
tmp2, a2_scale, quant_dtype, per_act_token_quant,
block_shape)

out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
w2_scale[i], block_shape,

@@ -1117,7 +1120,6 @@ def torch_experts(
else:
assert (a_scale is not None and w1_scale is not None
and w2_scale is not None)
f32 = torch.float32
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
tmp1 = a[mask].to(f32) * scales
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)

@@ -1126,8 +1128,8 @@ def torch_experts(
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
out[mask] = (tmp2 @ w2_dq).to(out.dtype)

return (out.view(M, -1, w2.shape[1]) *
topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)


def torch_moe(a: torch.Tensor,
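The last change above moves the top-k weighted reduction into float32 and only casts back to the output dtype at the end, which reduces rounding error in the reference path. A tiny sketch of the pattern (shapes are illustrative, not vLLM code):

```python
import torch

# Per-expert outputs gathered per token and the top-k routing weights.
M, topk, hidden = 8, 2, 16
out = torch.randn(M * topk, hidden, dtype=torch.bfloat16)
topk_weight = torch.rand(M, topk)

# Accumulate in float32, then cast once at the end.
result = (out.view(M, topk, hidden).to(torch.float32) *
          topk_weight.view(M, topk, 1)).sum(dim=1).to(out.dtype)
print(result.shape)  # torch.Size([8, 16])
```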
@@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
model_runner.model)


@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")

if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')

yield


@pytest.fixture
def reset_default_device():
"""
@@ -3,6 +3,7 @@

import pytest

from tests.models.registry import HF_EXAMPLE_MODELS
from tests.utils import multi_gpu_test
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams

@@ -19,31 +20,55 @@ pytestmark = pytest.mark.hybrid_model

SSM_MODELS = [
"state-spaces/mamba-130m-hf",
"tiiuae/falcon-mamba-tiny-dev",
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"mistralai/Mamba-Codestral-7B-v0.1",
]

HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
# NOTE: Currently the test failes due to HF transformers issue fixed in:
# https://github.com/huggingface/transformers/pull/39033
# We will enable vLLM test for Granite after next HF transformers release.
# "ibm-granite/granite-4.0-tiny-preview",
# NOTE: Running Plamo2 in transformers implementation requires to install
# causal-conv1d package, which is not listed as a test dependency as it's
# not compatible with pip-compile.
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
"ibm-ai-platform/Bamba-9B-v1",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]

HF_UNSUPPORTED_MODELS = [
# The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"mistralai/Mamba-Codestral-7B-v0.1",
# Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
# for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1
"nvidia/Nemotron-H-8B-Base-8K",
# NOTE: Currently the test fails due to HF transformers issue fixed in:
# https://github.com/huggingface/transformers/pull/39033
# We will enable vLLM test for Granite after next HF transformers release.
"ibm-granite/granite-4.0-tiny-preview",
]

V1_SUPPORTED_MODELS = [
"mistralai/Mamba-Codestral-7B-v0.1",
"ibm-ai-platform/Bamba-9B-v1",
"Zyphra/Zamba2-1.2B-instruct",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]

ATTN_BLOCK_SIZES = {
"ibm-ai-platform/Bamba-9B-v1": 528,
"Zyphra/Zamba2-1.2B-instruct": 80,
"nvidia/Nemotron-H-8B-Base-8K": 528,
"ibm-granite/granite-4.0-tiny-preview": 400,
"tiiuae/Falcon-H1-0.5B-Base": 800,
}

# Avoid OOM
MAX_NUM_SEQS = 4

@@ -60,8 +85,16 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:

try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass

with hf_runner(model) as hf_model:
if model != "mistralai/Mamba-Codestral-7B-v0.1":
if model not in HF_UNSUPPORTED_MODELS:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:

@@ -72,12 +105,21 @@ def test_models(
example_prompts, max_tokens, num_logprobs)

if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
# required due to reorder_batch behaviour
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True,
enable_prefix_caching=False) as vllm_model:
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:

@@ -111,6 +153,14 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:

try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass

for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
@@ -234,6 +234,35 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
return main_score


def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
with hf_runner(model_name, is_cross_encoder=True,
dtype="float32") as hf_model:

original_predict = hf_model.predict

def _predict(
sentences: list[tuple[str, str,
Optional[str]]],  # query, corpus, prompt
*args,
**kwargs,
):
# vllm and st both remove the prompt, fair comparison.
prompts = [(s[0], s[1]) for s in sentences]
return original_predict(prompts, *args, **kwargs, batch_size=8)

hf_model.predict = _predict
hf_model.original_predict = original_predict

if hf_model_callback is not None:
hf_model_callback(hf_model)

st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_dtype = next(hf_model.model.model.parameters()).dtype
return st_main_score, st_dtype


def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,

@@ -264,31 +293,8 @@ def mteb_test_rerank_models(hf_runner,
languages=MTEB_RERANK_LANGS)
vllm_dtype = model_config.dtype

with hf_runner(model_info.name, is_cross_encoder=True,
dtype="float32") as hf_model:

original_predict = hf_model.predict

def _predict(
sentences: list[tuple[str, str,
Optional[str]]],  # query, corpus, prompt
*args,
**kwargs,
):
# vllm and st both remove the prompt, fair comparison.
prompts = [(s[0], s[1]) for s in sentences]
return original_predict(prompts, *args, **kwargs, batch_size=8)

hf_model.predict = _predict
hf_model.original_predict = original_predict

if hf_model_callback is not None:
hf_model_callback(hf_model)

st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_dtype = next(hf_model.model.model.parameters()).dtype
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, hf_model_callback)

print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
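The helper factored out above wraps the runner's `predict` so that the optional prompt element is dropped before scoring, keeping the comparison with vLLM fair. A generic sketch of that wrapping pattern on a hypothetical scorer class (not vLLM's API):

```python
from typing import Optional


class Scorer:
    """Hypothetical cross-encoder wrapper with a (query, corpus) predict()."""

    def predict(self, pairs: list[tuple[str, str]], batch_size: int = 32):
        return [float(len(q) + len(d)) for q, d in pairs]  # dummy scores


scorer = Scorer()
original_predict = scorer.predict


def _predict(sentences: list[tuple[str, str, Optional[str]]], *args, **kwargs):
    # Drop the optional prompt element so both backends see the same inputs.
    pairs = [(s[0], s[1]) for s in sentences]
    return original_predict(pairs, *args, batch_size=8, **kwargs)


scorer.predict = _predict  # monkey-patch the bound method for the comparison
print(scorer.predict([("query", "document", None)]))
```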
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional

import pytest

@@ -74,6 +75,13 @@ def test_models(
vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False)

max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
]:
max_model_len = None

# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:

@@ -87,7 +95,7 @@ def test_models(

with vllm_runner(model,
task="embed",
max_model_len=512,
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
@@ -56,10 +56,16 @@ MODELS = [
enable_test=False),
]

V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":

@@ -71,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner,

@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from ...utils import EmbedModelInfo
tests/models/language/pooling/test_mxbai_rerank.py (new file, 84 lines)
@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import pytest
import torch

from tests.conftest import HfRunner

from .mteb_utils import RerankModelInfo, mteb_test_rerank_models

RERANK_MODELS = [
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=True),
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=False)
]


class MxbaiRerankerHfRunner(HfRunner):

def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)

self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")

def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:

def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs

@torch.no_grad()
def compute_logits(inputs):
logits = self.model(**inputs).logits[:, -1, :]
yes_logits = logits[:, self.yes_loc]
no_logits = logits[:, self.no_loc]
logits = yes_logits - no_logits
scores = logits.float().sigmoid()
return scores

scores = []
for prompt in prompts:
inputs = process_inputs([prompt])
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)


@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "Qwen2ForSequenceClassification":
vllm_extra_kwargs["hf_overrides"] = {
"architectures": ["Qwen2ForSequenceClassification"],
"classifier_from_token": ["0", "1"],
"method": "from_2_way_softmax",
}

mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
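The HF reference runner in the new test scores each (query, document) pair as the logit of the "1" token minus the logit of the "0" token at the last position, passed through a sigmoid, which is equivalent to the two-way softmax reading named in `classifier_from_token`/`from_2_way_softmax`. A sketch of that scoring rule on plain tensors (no model download; values are illustrative):

```python
import torch

# Pretend last-token logits for 3 (query, document) pairs over a toy
# vocabulary where id 0 is the "0" (irrelevant) token and id 1 is "1".
logits = torch.tensor([[0.2, 1.5, -0.3],
                       [2.0, 0.1, 0.4],
                       [-1.0, 3.0, 0.0]])
no_id, yes_id = 0, 1

# sigmoid(yes - no) == softmax over the two class logits, taken at "yes".
scores = (logits[:, yes_id] - logits[:, no_id]).sigmoid()
print(scores)  # higher means "more relevant"
```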
@@ -33,9 +33,6 @@ if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
# V1 Test: not enough KV cache space in C1.
"fuyu",
]

@@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported
models=["Salesforce/blip2-opt-6.7b"],
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",

@@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
"h2ovl": VLMTestInfo(
models = [
"h2oai/h2ovl-mississippi-800m",
# TODO: Re-enable once head_dim = 80 is supported
# "h2oai/h2ovl-mississippi-2b",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",  # noqa: E501
@@ -83,7 +83,7 @@ MODELS = [
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]
@@ -169,7 +169,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"),  # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
min_transformers_version="4.53"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),

@@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
v0_only=True),

@@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),  # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
v0_only=True),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),  # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",

@@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),  # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"},  # noqa: E501
v0_only=True),
extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"},  # noqa: E501

@@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
trust_remote_code=True,
v0_only=True),
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip")

# FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration":
if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")

# Avoid OOM and reduce initialization time by only using 1 layer
@@ -9,9 +9,9 @@ import torch.cuda
from vllm.model_executor.models import (is_pooling_model,
is_text_generation_model,
supports_multimodal)
from vllm.model_executor.models.adapters import (as_classification_model,
as_embedding_model,
as_reward_model)
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,

@@ -38,7 +38,7 @@ def test_registry_imports(model_arch):
assert is_text_generation_model(model_cls)

# All vLLM models should be convertible to a pooling model
assert is_pooling_model(as_classification_model(model_cls))
assert is_pooling_model(as_seq_cls_model(model_cls))
assert is_pooling_model(as_embedding_model(model_cls))
assert is_pooling_model(as_reward_model(model_cls))
@@ -172,9 +172,10 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
"num_frames": num_frames,
}})

video_sync = connector.fetch_video(video_url)
video_async = await connector.fetch_video_async(video_url)
assert np.array_equal(video_sync[0], video_async[0])
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async


# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
doesn't test correctness
@@ -52,7 +52,7 @@ def test_get_field():
("distilbert/distilgpt2", "generate", "generate"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
("openai/whisper-small", "transcription", "transcription"),
],

@@ -72,6 +72,32 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
assert config.task == expected_task


@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
("openai/whisper-small", "pooling", "embed"),
],
)
def test_score_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
task="score",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)

assert config.runner_type == expected_runner_type
assert config.task == expected_task


@pytest.mark.parametrize(("model_id", "bad_task"), [
("Qwen/Qwen2.5-Math-RM-72B", "generate"),
])
@@ -20,10 +20,11 @@ from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
MemorySnapshot, PlaceholderModule, StoreBoolean,
bind_kv_cache, common_broadcastable_dtype,
deprecate_kwargs, get_open_port, is_lossless_cast,
make_zmq_path, make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_zmq_path,
supports_kw, swap_dict_values)
deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path,
make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values)

from .utils import create_new_process_for_each_test, error_on_warning

@@ -876,3 +877,44 @@ def test_make_zmq_socket_ipv6():
def test_make_zmq_path():
assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"


def test_get_tcp_uri():
assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"


def test_split_host_port():
# valid ipv4
assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
# invalid ipv4
with pytest.raises(ValueError):
# multi colon
assert split_host_port("127.0.0.1::5555")
with pytest.raises(ValueError):
# tailing colon
assert split_host_port("127.0.0.1:5555:")
with pytest.raises(ValueError):
# no colon
assert split_host_port("127.0.0.15555")
with pytest.raises(ValueError):
# none int port
assert split_host_port("127.0.0.1:5555a")

# valid ipv6
assert split_host_port("[::1]:5555") == ("::1", 5555)
# invalid ipv6
with pytest.raises(ValueError):
# multi colon
assert split_host_port("[::1]::5555")
with pytest.raises(IndexError):
# no colon
assert split_host_port("[::1]5555")
with pytest.raises(ValueError):
# none int port
assert split_host_port("[::1]:5555a")


def test_join_host_port():
assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
assert join_host_port("::1", 5555) == "[::1]:5555"
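The new `split_host_port` / `join_host_port` helpers are exercised above with both plain IPv4 and bracketed IPv6 forms. A stdlib-only sketch of such helpers, written only to illustrate the expected behavior (this is not vLLM's implementation, and its error types may differ for edge cases):

```python
def join_host_port(host: str, port: int) -> str:
    # Bracket IPv6 literals so the port separator stays unambiguous.
    if ":" in host:
        return f"[{host}]:{port}"
    return f"{host}:{port}"


def split_host_port(addr: str) -> tuple[str, int]:
    if addr.startswith("["):
        host, sep, port = addr[1:].partition("]:")
    else:
        host, sep, port = addr.partition(":")
    if not sep or ":" in port:
        raise ValueError(f"invalid address: {addr!r}")
    return host, int(port)  # int() raises ValueError for non-numeric ports


assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
assert split_host_port("[::1]:5555") == ("::1", 5555)
assert join_host_port("::1", 5555) == "[::1]:5555"
```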
tests/tool_use/test_minimax_tool_parser.py (new file, 372 lines)
@@ -0,0 +1,372 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501

import json

import pytest

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser
from vllm.transformers_utils.tokenizer import get_tokenizer

# Use a common model that is likely to be available
MODEL = "MiniMaxAi/MiniMax-M1-40k"


@pytest.fixture(scope="module")
def minimax_tokenizer():
return get_tokenizer(tokenizer_name=MODEL)


@pytest.fixture
def minimax_tool_parser(minimax_tokenizer):
return MinimaxToolParser(minimax_tokenizer)


def assert_tool_calls(actual_tool_calls: list[ToolCall],
expected_tool_calls: list[ToolCall]):
assert len(actual_tool_calls) == len(expected_tool_calls)

for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
expected_tool_calls):
assert isinstance(actual_tool_call.id, str)
assert len(actual_tool_call.id) > 16

assert actual_tool_call.type == "function"
assert actual_tool_call.function == expected_tool_call.function


def test_extract_tool_calls_no_tools(minimax_tool_parser):
model_output = "This is a test"
extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]
assert not extracted_tool_calls.tools_called
assert extracted_tool_calls.tool_calls == []
assert extracted_tool_calls.content == model_output


@pytest.mark.parametrize(
ids=[
"single_tool_call",
"multiple_tool_calls",
"tool_call_with_content_before",
"tool_call_with_single_line_json",
"tool_call_incomplete_tag",
],
argnames=["model_output", "expected_tool_calls", "expected_content"],
argvalues=[
(
"""<tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}
</tool_calls>""",
[
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
}),
))
],
None,
),
(
"""<tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}
{"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}
</tool_calls>""",
[
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
}),
)),
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "Orlando",
"state": "FL",
"unit": "fahrenheit",
}),
)),
],
None,
),
(
"""I'll help you check the weather. <tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}
</tool_calls>""",
[
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "Seattle",
"state": "WA",
"unit": "celsius",
}),
))
],
"I'll help you check the weather.",
),
(
"""<tool_calls>
{"name": "get_current_weather", "arguments": {"city": "New York", "state": "NY", "unit": "celsius"}}
</tool_calls>""",
[
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "New York",
"state": "NY",
"unit": "celsius",
}),
))
],
None,
),
(
"""<tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Boston", "state": "MA"}}""",
[
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({
"city": "Boston",
"state": "MA",
}),
))
],
None,
),
],
)
def test_extract_tool_calls(minimax_tool_parser, model_output,
expected_tool_calls, expected_content):
extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]
assert extracted_tool_calls.tools_called

assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)

assert extracted_tool_calls.content == expected_content


def test_preprocess_model_output_with_thinking_tags(minimax_tool_parser):
"""Test that tool calls within thinking tags are removed during preprocessing."""
model_output = """<think>Let me think about this. <tool_calls>
{"name": "fake_tool", "arguments": {"param": "value"}}
</tool_calls> This should be removed.</think>

I'll help you with that. <tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA"}}
</tool_calls>"""

processed_output = minimax_tool_parser.preprocess_model_output(
model_output)

# The tool call within thinking tags should be removed
assert "fake_tool" not in processed_output
# But the thinking tag itself should remain
assert "<think>" in processed_output
assert "</think>" in processed_output
# The actual tool call outside thinking tags should remain
assert "get_current_weather" in processed_output


def test_extract_tool_calls_with_thinking_tags(minimax_tool_parser):
"""Test tool extraction when thinking tags contain tool calls that should be ignored."""
model_output = """<think>I should use a tool. <tool_calls>
{"name": "ignored_tool", "arguments": {"should": "ignore"}}
</tool_calls></think>

Let me help you with the weather. <tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Miami", "state": "FL", "unit": "fahrenheit"}}
</tool_calls>"""

extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]

assert extracted_tool_calls.tools_called
assert len(extracted_tool_calls.tool_calls) == 1
assert extracted_tool_calls.tool_calls[
0].function.name == "get_current_weather"

# Content extraction is based on the position of the first <tool_calls> in the original model_output
# Since preprocessing removes tool calls within thinking tags, the actual first <tool_calls> is the external one
expected_content = """<think>I should use a tool. <tool_calls>
{"name": "ignored_tool", "arguments": {"should": "ignore"}}
</tool_calls></think>

Let me help you with the weather."""
assert extracted_tool_calls.content == expected_content


def test_extract_tool_calls_invalid_json(minimax_tool_parser):
"""Test that invalid JSON in tool calls is handled gracefully."""
model_output = """<tool_calls>
{"name": "valid_tool", "arguments": {"city": "Seattle"}}
{invalid json here}
{"name": "another_valid_tool", "arguments": {"param": "value"}}
</tool_calls>"""

extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]

assert extracted_tool_calls.tools_called
# Should extract only the valid JSON tool calls
assert len(extracted_tool_calls.tool_calls) == 2
assert extracted_tool_calls.tool_calls[0].function.name == "valid_tool"
assert extracted_tool_calls.tool_calls[
1].function.name == "another_valid_tool"


def test_extract_tool_calls_missing_name_or_arguments(minimax_tool_parser):
"""Test that tool calls missing name or arguments are filtered out."""
model_output = """<tool_calls>
{"name": "valid_tool", "arguments": {"city": "Seattle"}}
{"name": "missing_args"}
{"arguments": {"city": "Portland"}}
{"name": "another_valid_tool", "arguments": {"param": "value"}}
</tool_calls>"""

extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]

assert extracted_tool_calls.tools_called
# Should extract only the valid tool calls with both name and arguments
assert len(extracted_tool_calls.tool_calls) == 2
assert extracted_tool_calls.tool_calls[0].function.name == "valid_tool"
assert extracted_tool_calls.tool_calls[
1].function.name == "another_valid_tool"


def test_streaming_basic_functionality(minimax_tool_parser):
"""Test basic streaming functionality."""
# Reset streaming state
minimax_tool_parser.current_tool_name_sent = False
minimax_tool_parser.prev_tool_call_arr = []
minimax_tool_parser.current_tool_id = -1
minimax_tool_parser.streamed_args_for_tool = []

# Test with a simple tool call
current_text = """<tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Seattle"}}
</tool_calls>"""

# First call should handle the initial setup
result = minimax_tool_parser.extract_tool_calls_streaming(
previous_text="",
current_text=current_text,
delta_text="</tool_calls>",
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=None,
)

# The result might be None or contain tool call information
# This depends on the internal state management
if result is not None and hasattr(result,
'tool_calls') and result.tool_calls:
assert len(result.tool_calls) >= 0


def test_streaming_with_content_before_tool_calls(minimax_tool_parser):
"""Test streaming when there's content before tool calls."""
# Reset streaming state
minimax_tool_parser.current_tool_name_sent = False
minimax_tool_parser.prev_tool_call_arr = []
minimax_tool_parser.current_tool_id = -1
minimax_tool_parser.streamed_args_for_tool = []

current_text = "I'll help you with that. <tool_calls>"

# When there's content before tool calls, it should be returned as content
result = minimax_tool_parser.extract_tool_calls_streaming(
previous_text="I'll help you",
current_text=current_text,
delta_text=" with that. <tool_calls>",
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=None,
)

if result is not None and hasattr(result, 'content'):
# Should contain some content
assert result.content is not None


def test_streaming_no_tool_calls(minimax_tool_parser):
"""Test streaming when there are no tool calls."""
current_text = "This is just regular text without any tool calls."

result = minimax_tool_parser.extract_tool_calls_streaming(
previous_text="This is just regular text",
current_text=current_text,
delta_text=" without any tool calls.",
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=None,
)

# Should return the delta text as content
assert result is not None
assert hasattr(result, 'content')
assert result.content == " without any tool calls."


def test_streaming_with_thinking_tags(minimax_tool_parser):
"""Test streaming with thinking tags that contain tool calls."""
# Reset streaming state
minimax_tool_parser.current_tool_name_sent = False
minimax_tool_parser.prev_tool_call_arr = []
minimax_tool_parser.current_tool_id = -1
minimax_tool_parser.streamed_args_for_tool = []

current_text = """<think><tool_calls>{"name": "ignored", "arguments": {}}</tool_calls></think><tool_calls>{"name": "real_tool", "arguments": {"param": "value"}}</tool_calls>"""

result = minimax_tool_parser.extract_tool_calls_streaming(
previous_text="",
current_text=current_text,
delta_text=current_text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=None,
)

# The preprocessing should remove tool calls from thinking tags
# and only process the real tool call
if result is not None and hasattr(result,
'tool_calls') and result.tool_calls:
for tool_call in result.tool_calls:
assert tool_call.function.name != "ignored"


def test_extract_tool_calls_multiline_json_not_supported(minimax_tool_parser):
"""Test that multiline JSON in tool calls is not currently supported."""
model_output = """<tool_calls>
{
"name": "get_current_weather",
"arguments": {
"city": "New York",
"state": "NY",
"unit": "celsius"
}
}
</tool_calls>"""

extracted_tool_calls = minimax_tool_parser.extract_tool_calls(
model_output, request=None)  # type: ignore[arg-type]

# Multiline JSON is currently not supported, should return no tools called
assert not extracted_tool_calls.tools_called
assert extracted_tool_calls.tool_calls == []
assert extracted_tool_calls.content is None
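The tests above exercise a format in which each tool call is a single-line JSON object between `<tool_calls>` and `</tool_calls>`, with malformed or incomplete entries skipped. A stdlib-only sketch of extracting calls from that format (an illustration of the format, not the `MinimaxToolParser` implementation):

```python
import json
import re

TOOL_CALLS_RE = re.compile(r"<tool_calls>\n?(.*?)(?:</tool_calls>|$)", re.DOTALL)


def extract_tool_calls(model_output: str) -> list[dict]:
    """Return {"name": ..., "arguments": ...} dicts for each valid JSON line."""
    calls = []
    for block in TOOL_CALLS_RE.findall(model_output):
        for line in block.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed entries
            if isinstance(obj, dict) and "name" in obj and "arguments" in obj:
                calls.append(obj)
    return calls


out = """I'll check the weather. <tool_calls>
{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX"}}
</tool_calls>"""
print(extract_tool_calls(out))  # one call named get_current_weather
```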
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
@@ -9,7 +9,7 @@ import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,

@@ -17,6 +17,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.request import StructuredOutputRequest

EOS_TOKEN_ID = 50256

@@ -33,6 +34,7 @@ def create_scheduler(
block_size: int = 16,
max_model_len: Optional[int] = None,
num_speculative_tokens: Optional[int] = None,
skip_tokenizer_init: bool = False,
) -> Scheduler:
'''Create scheduler under test.

@@ -65,6 +67,7 @@ def create_scheduler(
trust_remote_code=True,
dtype="float16",
seed=42,
skip_tokenizer_init=skip_tokenizer_init,
)
# Cache config, optionally force APC
kwargs_cache = ({} if enable_prefix_caching is None else {

@@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property():
# Verify requests were scheduled in priority order (lowest value first)
expected_priorities = sorted(priorities)
assert scheduled_priorities == expected_priorities


def test_schedule_skip_tokenizer_init():
scheduler = create_scheduler(skip_tokenizer_init=True)
requests = create_requests(num_requests=5)
for request in requests:
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
assert output.grammar_bitmask is None


def test_schedule_skip_tokenizer_init_structured_output_request():
scheduler = create_scheduler(skip_tokenizer_init=True)
guided_params = GuidedDecodingParams(regex="[0-9]+")
sampling_params = SamplingParams(
ignore_eos=False,
max_tokens=16,
guided_decoding=guided_params,
)
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=EOS_TOKEN_ID,
structured_output_request=StructuredOutputRequest(sampling_params),
)
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 0
assert len(scheduler.running) == 0
assert len(scheduler.waiting) == 1
@@ -26,8 +26,8 @@ from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
SyncMPClient)
from vllm.v1.engine.utils import CoreEngineProcManager
from vllm.v1.executor.abstract import Executor
from vllm.v1.utils import CoreEngineProcManager

from ...distributed.conftest import MockSubscriber
from ...utils import create_new_process_for_each_test

@@ -563,7 +563,7 @@ def test_engine_core_proc_instantiation_cuda_empty(
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices

from vllm.v1.utils import EngineZmqAddresses
from vllm.v1.engine.utils import EngineZmqAddresses

def mock_startup_handshake(self, handshake_socket, on_head_node,
parallel_config):

@@ -580,7 +580,7 @@ def test_engine_core_proc_instantiation_cuda_empty(
trust_remote_code=True).create_engine_config()
engine_core_proc = EngineCoreProc(
vllm_config=vllm_config,
on_head_node=True,
local_client=True,
handshake_address="tcp://127.0.0.1:12345",
executor_class=mock_executor_class,
log_stats=False,