Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)

Compare commits: v0.11.1rc1 ... 8a81d776ce (167 commits)
Commits in this range (SHA1 only):
8a81d776ce f6fdacd82c d31f7844f8 7a6c8c3fa1 221bf72577 b3aba04e5a 8a297115e2 191eed0bb9
fb860670da 83e760c57d c2bba69065 e133d6d218 a1946c9f61 9f020f4f31 3b45075206 168e578efc
6ac5e06f7c 5c2acb270a b26b70bec4 ab4be40fc5 245e4f2c01 1d165d6d85 83004020fd 12e21701e7
30a33b92ee 7c572544e4 c312320764 c981f0ea78 6367bde739 f50cc221ea acedc74b1a d29483b58a
950cf9e58e 3125d79950 e33ee23ee3 b10c64c834 0925b28a8e 99722d5f0e 4c91a28e30 b038d9c40c
2ba60ec7fe bd7157a071 be429d0cfd c253745eb8 daec4d2624 6c9fdbf725 483ea64611 e20eba753b
bbc1b29665 acb1bfa601 75c7ad9918 5550ff9c25 3aeb19a39e 8c017b3490 9c2c2287a0 fec2b341ad
87bc0c492f fe3b9372ad bde9e2272a 08405609cc ab81379ea6 4ffd6e8942 965c5f4914 4d055ef465
17c540a993 4d4d6bad19 11ae016bd7 41d3071918 fb5e10d3fb b2f78cbad4 23583ee28c 01c977e96d
b3dda72c23 fb0571b077 2ed8b6b3d0 013abde6ef a5464dcf92 ac3ed5a815 e6ba2000ae aa255ff55a
7bb736d00e 9f4e30904b 5afd3276df 43721bc67f 02d709a6f1 4a510ab487 314fa8abbf 334535b6fb
dcbb3f1871 00417f4e44 ed344f4116 e51928793e d2740fafbf 17838e50ef 44c8555621 f7d318de2b
76f0d05bc6 7d8975de84 785d8b6410 f6cdc9a02f 509cdc0370 9b6504c307 e19b16dde6 582f2c6be7
f8a0acbdbe 1317034379 0ecc553ee6 f96bc3649c 938c43ea7f 0a9ef0cfce e5b438a247 0b99f5d302
1f491aa0c8 de92d916fe a1063628a4 d796375258 14f8456344 4794c2bd92 d3cbaa08dc 828523ad8e
136a17fe6e f57438338d 5d598680e3 8f4b313c37 f93e348010 f54f85129e d4d1a6024f db1764e4e0
7f83b4ee8e 5c3bae1a6a 5210dc3940 650b51f9f9 6256697997 71557a5f7c f3c378ffa7 f5ed68ef63
efdef57b1f b8a4572157 302ef403a2 8865da157b f0862eae43 8c851f6d04 7cfa420f49 a27b288e4a
e471d7ca7e c43ca8259e 85a65e7f51 a2986b3e33 96b9aa5aa0 e66d787bce bfad142e25 9354660036
07ca70af8d 2dcd12d357 579d2e5458 0512c04aee 7e0ef4084a 4aed506b65 a86b4c58e8 ff4810ba73
9d6964926e 0e65818910 380f17527c b92ab3deda acaa2c0a4a 82af928c41 87efc681db
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, 12 lines)

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
@@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
    value: 0.80
limit: 100
num_fewshot: 0
@@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1

model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
limit: 2500
num_fewshot: 0
.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)

@@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)

@@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9

usage() {
    echo``
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode change: Normal file → Executable file)

@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo``
    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
@@ -19,21 +19,27 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
        # text models. however, this is regressing measured strict-match for
        # existing text models in CI, so only apply it for mm.
        apply_chat_template=backend == "vllm-vlm",
        batch_size=batch_size,
    )
    return results

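To illustrate how the new config keys above (backend, batch_size) reach lm-eval, here is a hedged sketch that inlines the values from the Qwen2.5-VL-7B-Instruct ChartQA config shown earlier; the tensor_parallel_size, trust_remote_code, and max_model_len values are just the defaults assumed by launch_lm_eval, not anything introduced by this diff:

import lm_eval

# Values copied from the Qwen2.5-VL-7B-Instruct.yaml config above.
eval_config = {
    "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
    "backend": "vllm-vlm",
    "tasks": [{"name": "chartqa"}],
    "limit": 2500,
    "num_fewshot": 0,
}

model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size=1,"
    f"enforce_eager=true,"
    f"add_bos_token=true,"
    f"trust_remote_code=False,"
    f"max_model_len=4096,"
)
results = lm_eval.simple_evaluate(
    model=eval_config.get("backend", "vllm"),  # "vllm-vlm" selects the multimodal backend
    model_args=model_args,
    tasks=[t["name"] for t in eval_config["tasks"]],
    num_fewshot=eval_config["num_fewshot"],
    limit=eval_config["limit"],
    # Chat template is only applied for the multimodal backend, per the TODO above.
    apply_chat_template=eval_config.get("backend", "vllm") == "vllm-vlm",
    batch_size=eval_config.get("batch_size", "auto"),
)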
@@ -70,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Note: disable it until supports V1
  # Run AWQ test
@@ -63,7 +63,7 @@ steps:

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:

@@ -353,7 +353,7 @@ steps:
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:

@@ -459,6 +459,7 @@ steps:
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py

- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30

@@ -487,14 +488,14 @@ steps:

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
  - pytest -v -s kernels/core
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py

- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35

@@ -603,7 +604,8 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/

- label: LM Eval Small Models # 53min

@@ -631,7 +633,7 @@ steps:

- label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  fast_check: false

@@ -416,8 +416,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
  timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:

@@ -425,6 +425,7 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75

@@ -527,8 +528,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
  timeout_in_minutes: 75

@@ -733,6 +735,16 @@ steps:
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true

@@ -796,8 +808,8 @@ steps:
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
  timeout_in_minutes: 60
- label: Blackwell Test # 21 min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true

@@ -810,8 +822,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py

@@ -828,15 +838,32 @@ steps:
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  # Fusion
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py

- label: Blackwell Fusion Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60

@@ -993,6 +1020,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model

@@ -1057,6 +1089,17 @@ steps:
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh


##### multi gpus test #####

@@ -1089,7 +1132,7 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"

@@ -1097,6 +1140,8 @@ steps:
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

.github/CODEOWNERS (vendored, 5 lines changed)

@@ -5,9 +5,7 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn

@@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep

@@ -60,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC

# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor

# Docs

@@ -4,7 +4,6 @@ MD013: false
MD024:
  siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false
@@ -31,6 +31,7 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass

import datasets

@@ -501,15 +502,9 @@ async def benchmark(

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    # This can be used once the minimum Python version is 3.10 or higher,
    # and it will simplify the code in limited_request_func.
    # semaphore = (asyncio.Semaphore(max_concurrency)
    #              if max_concurrency else contextlib.nullcontext())
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

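A small aside on the nullcontext() change above: on Python 3.10+ contextlib.nullcontext also works as an async context manager, so the concurrency limiter no longer needs a None branch. A minimal sketch under that assumption:

import asyncio
from contextlib import nullcontext

async def run_limited(coro_fn, max_concurrency=None):
    # Either a real semaphore or a no-op async context manager.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
    async with semaphore:
        return await coro_fn()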
@@ -10,7 +10,8 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


def with_triton_mode(fn):

@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]

@@ -7,7 +7,8 @@ import torch

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()

@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)

@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
    topk = config.num_experts_per_tok

    hidden_size = config.hidden_size
    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute

@@ -9,9 +9,9 @@ import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
)

@@ -1,155 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import itertools

import torch

from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton


def polynorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])

    def norm(x, eps: float):
        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)

    x = x.float()
    return (
        (
            weight[0] * norm(x**3, eps)
            + weight[1] * norm(x**2, eps)
            + weight[2] * norm(x, eps)
            + bias
        )
        .to(weight.dtype)
        .view(orig_shape)
    )


def polynorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])

    out = torch.empty_like(x)
    vllm_ops.poly_norm(out, x, weight, bias, eps)
    output = out

    output = output.view(orig_shape)
    return output


def calculate_diff(batch_size, seq_len, hidden_dim):
    dtype = torch.bfloat16
    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
    weight = torch.ones(3, dtype=dtype, device="cuda")
    bias = torch.ones(1, dtype=dtype, device="cuda")

    output_naive = polynorm_naive(x, weight, bias)
    output_vllm = polynorm_vllm(x, weight, bias)

    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
        print("✅ All implementations match")
    else:
        print("❌ Implementations differ")


batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))


def get_benchmark():
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["dim", "batch_size", "seq_len"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["naive", "vllm"],
            line_names=["Naive", "vLLM"],
            styles=[("blue", "-"), ("red", "-")],
            ylabel="us",
            plot_name="polynorm-perf",
            args={},
        )
    )
    def benchmark(dim, batch_size, seq_len, provider):
        dtype = torch.bfloat16
        hidden_dim = dim * 4

        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
        weight = torch.ones(3, dtype=dtype, device="cuda")
        bias = torch.ones(1, dtype=dtype, device="cuda")

        quantiles = [0.5, 0.2, 0.8]

        if provider == "naive":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_naive(x, weight, bias),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_vllm(x, weight, bias),
                quantiles=quantiles,
            )

        return 1000 * ms, 1000 * max_ms, 1000 * min_ms

    return benchmark


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size",
    )
    parser.add_argument(
        "--seq-len",
        type=int,
        default=128,
        help="Sequence length",
    )
    parser.add_argument(
        "--hidden-dim",
        type=int,
        default=8192,
        help="Intermediate size of MLP",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default="./configs/polnorm/",
        help="Path to save polnorm benchmark results",
    )

    args = parser.parse_args()

    # Run correctness test
    calculate_diff(
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        hidden_dim=args.hidden_dim,
    )

    benchmark = get_benchmark()
    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)
@@ -7,7 +7,8 @@ import torch

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()

@@ -9,9 +9,9 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
)

@@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random_flash,
)

@@ -1251,7 +1251,7 @@ async def main() -> None:
        default=None,
        help="The model name used in the API. "
        "If not specified, the model name will be the "
        "same as the ``--model`` argument. ",
        "same as the `--model` argument. ",
    )

    parser.add_argument(

@@ -22,10 +22,10 @@ else()
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
  )
  FetchContent_Populate(qutlass)
  set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()

FetchContent_Populate(qutlass)

if(NOT qutlass_SOURCE_DIR)
  message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()

@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
    GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

@@ -125,32 +125,37 @@
  }

  static void set_split_kv (KernelArguments& args) {
    // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
    // std::cout << H << " " << K << " " << D << " " << B << "\n";
    int sm_count = args.hw_info.sm_count;
    // printf("  sm_count = %d\n", sm_count);
    int max_splits = ceil_div(K, 128);
    max_splits = min(16, max_splits);
    float seq_length_k = static_cast<float>(K) / 1024.0f;
    int max_splits = 1;

    // TODO: This avoids a hang when the batch size larger than 1 and
    // there is more than 1 kv_splits.
    // Discuss with NVIDIA how this can be fixed.
    if (B > 1) {
      max_splits = min(1, max_splits);
    if (B <= 4 && seq_length_k >= 16) {
      max_splits = 16;
    }

    // printf("  max_splits = %d\n", max_splits);
    else if (B <= 8 && seq_length_k >= 4) {
      max_splits = 8;
    }
    else if ((B <= 16 && seq_length_k >= 8) ||
             (B == 48 && seq_length_k >= 32)) {
      max_splits = 4;
    }
    else if ((B <= 32 && seq_length_k >= 16) ||
             (B == 96 && seq_length_k >= 16)) {
      max_splits = 2;
    }
    else {
      max_splits = 1;
    }

    // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
    // printf("  sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
    // printf("  args.split_kv = %d\n", args.split_kv);

  }

  /// Determines whether the GEMM can execute the given problem.

@@ -5,11 +5,11 @@

namespace vllm {

// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
  static bool cached = []() {
    std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
    std::string env_key = "VLLM_BATCH_INVARIANT";
    const char* val = std::getenv(env_key.c_str());
    return (val && std::atoi(val) != 0) ? 1 : 0;
  }();
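For readers skimming the sm100 FMHA change above, here is a rough Python sketch of the reworked split-KV heuristic (my own paraphrase of the hunk, with B the batch size, K the KV sequence length, and sm_count taken from the kernel arguments; ceil_div is written out because the CUTLASS helper is not shown here):

def ceil_div(a, b):
    return (a + b - 1) // b

def choose_split_kv(B, K, sm_count):
    seq_length_k = K / 1024.0
    # Static table: fewer batches and longer sequences get more KV splits.
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1
    # Wave-aware scheduling: keep an integer number of waves in the K dimension
    # so the chosen split does not oversubscribe the SMs available per batch.
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = ceil_div(max_splits, split_heur)
    return ceil_div(max_splits, k_waves)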
@ -2,6 +2,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
|
||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -136,211 +148,6 @@ fused_add_rms_norm_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
/* Function specialization in the case of FP16/BF16 tensors.
|
||||
Additional optimizations we can make in this case are
|
||||
packed and vectorized operations, which help with the
|
||||
memory latency bottleneck.
|
||||
|
||||
_f16VecPN struct extends _f16Vec to add operations specifically required for
|
||||
polynomial normalization (poly norm).
|
||||
The original _f16Vec does not include the sum-of-powers computation or
|
||||
in-place polynomial normalization logic. */
|
||||
template <typename scalar_t, int width>
|
||||
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
|
||||
using Base = _f16Vec<scalar_t, width>;
|
||||
using Converter = typename Base::Converter;
|
||||
using T1 = typename Base::T1;
|
||||
using T2 = typename Base::T2;
|
||||
using Base::data;
|
||||
|
||||
__device__ auto sum_pows() const {
|
||||
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
float x2 = z.x * z.x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y4 = y2 * y2;
|
||||
float y6 = y4 * y2;
|
||||
|
||||
s2 += x2 + y2;
|
||||
s4 += x4 + y4;
|
||||
s6 += x6 + y6;
|
||||
}
|
||||
return std::make_tuple(s2, s4, s6);
|
||||
}
|
||||
|
||||
__device__ void poly_norm_inplace(const float w2_inv_std,
|
||||
const float w1_inv_std2,
|
||||
const float w0_inv_std3, const float bias) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
|
||||
float x2 = z.x * z.x;
|
||||
float x3 = x2 * z.x;
|
||||
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y3 = y2 * z.y;
|
||||
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
|
||||
|
||||
auto out = Converter::convert(z);
|
||||
data[i] = out.x;
|
||||
data[i + 1] = out.y;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
// Sanity checks on our vector struct and type-punned pointer arithmetic
|
||||
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
|
||||
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||
|
||||
/* These and the argument pointers are all declared `restrict` as they are
|
||||
not aliased in practice. Argument pointers should not be dereferenced
|
||||
in this kernel as that would be undefined behavior */
|
||||
auto* __restrict__ input_v =
|
||||
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
|
||||
const int vec_hidden_size = hidden_size / width;
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
auto [x2, x4, x6] = temp.sum_pows();
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
|
||||
out_v[id] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Generic poly_norm_kernel
|
||||
The width field is not used here but necessary for other specializations.
|
||||
*/
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x3 = x2 * x;
|
||||
|
||||
out[blockIdx.x * hidden_size + idx] =
|
||||
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
|
||||
s_bias);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
@ -352,18 +159,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
int64_t input_stride = input.stride(-2);
|
||||
|
||||
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
|
||||
// Instead, we use a 2d view to get the second-innermost stride.
|
||||
// That way the dimensions (except the last one) can be arbitrarily permuted.
|
||||
torch::Tensor input_view = input.view({-1, hidden_size});
|
||||
|
||||
int num_tokens = input_view.numel() / hidden_size;
|
||||
int64_t input_stride = input_view.stride(-2);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(hidden_size, 1024));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
|
||||
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input_view.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
|
||||
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
|
||||
hidden_size);
|
||||
});
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||
@ -380,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& residual, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [hidden_size]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
int hidden_size = input.size(-1);
|
||||
@ -414,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
wt_ptr % req_alignment_bytes == 0;
|
||||
bool offsets_are_multiple_of_vector_width =
|
||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
@ -422,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
}
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_POLY_NORM(width) \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
|
||||
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
|
||||
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
|
||||
hidden_size); \
|
||||
});
|
||||
|
||||
void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [3]
|
||||
torch::Tensor& bias, // [1]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.data_ptr() != input.data_ptr());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
/* This kernel is memory-latency bound in many scenarios.
|
||||
When num_tokens is large, a smaller block size allows
|
||||
for increased block occupancy on CUs and better latency
|
||||
hiding on global mem ops. */
|
||||
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
|
||||
dim3 block(std::min(hidden_size, max_block_size));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
/*If the tensor types are FP16/BF16, try to use the optimized kernel
|
||||
with packed + vectorized ops.
|
||||
Max optimization is achieved with a width-8 vector of FP16/BF16s
|
||||
since we can load at most 128 bits at once in a global memory op.
|
||||
However, this requires each tensor's data to be aligned to 16
|
||||
bytes.
|
||||
*/
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_POLY_NORM(0);
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
int hidden_size = input.size(-1);
|
||||
int input_stride = input.stride(-2);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||
bool ptrs_are_aligned =
|
||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
|
@ -8,12 +8,77 @@
|
||||
|
||||
#include "../cuda_compat.h"
|
||||
#include "../dispatch_utils.h"
|
||||
#include "core/math.hpp"
|
||||
|
||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace batched_moe_align_block_size {
|
||||
|
||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
||||
static constexpr int32_t num_threads = 1024;
|
||||
static constexpr int32_t num_blocks = 1;
|
||||
__global__ void batched_moe_align_block_size_kernel(
|
||||
int32_t const num_batches, int32_t const max_tokens_per_batch,
|
||||
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
|
||||
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
|
||||
int32_t* __restrict__ num_tokens_post_pad) {
|
||||
// TODO(varun): This is a naive implementation. Could be optimized.
|
||||
|
||||
size_t const batch_id = threadIdx.x;
|
||||
size_t const stride = blockDim.x * gridDim.x;
|
||||
int32_t const num_blocks_per_batch =
|
||||
CEILDIV(max_tokens_per_batch, block_size);
|
||||
int32_t const sorted_ids_size =
|
||||
num_blocks_per_batch * num_batches * block_size;
|
||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||
int32_t const SENTINEL =
|
||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||
// Intialize sorted_ids
|
||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||
sorted_ids[i] = SENTINEL;
|
||||
}
|
||||
// Intialize expert_ids with -1
|
||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||
block_ids[i] = -1;
|
||||
}
|
||||
|
||||
int32_t b_num_tokens = 0;
|
||||
if (batch_id < num_batches) {
|
||||
b_num_tokens = batch_num_tokens[batch_id];
|
||||
}
|
||||
int32_t const ceil_b_num_tokens =
|
||||
CEILDIV(b_num_tokens, block_size) * block_size;
|
||||
|
||||
// Compute prefix sum over token counts per expert
|
||||
using BlockScan = cub::BlockScan<int32_t, 1024>;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
int cumsum_val;
|
||||
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
|
||||
__syncthreads();
|
||||
|
||||
bool const is_last_batch = batch_id == (num_batches - 1);
|
||||
if (is_last_batch) {
|
||||
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
|
||||
}
|
||||
|
||||
if (batch_id < num_batches) {
|
||||
int32_t const batch_offset = batch_id * max_tokens_per_batch;
|
||||
for (size_t i = 0; i < b_num_tokens; ++i) {
|
||||
sorted_ids[cumsum_val + i] = batch_offset + i;
|
||||
}
|
||||
|
||||
int32_t const block_start = cumsum_val / block_size;
|
||||
int32_t const num_blocks = ceil_b_num_tokens / block_size;
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
block_ids[block_start + i] = batch_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace batched_moe_align_block_size

template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
    const scalar_t* __restrict__ topk_ids,

@@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  });
}

void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
                                  torch::Tensor const& batch_num_tokens,
                                  torch::Tensor sorted_ids,
                                  torch::Tensor batch_ids,
                                  torch::Tensor num_tokens_post_pad) {
  namespace batched_kernel = vllm::moe::batched_moe_align_block_size;

  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  int32_t const B = batch_num_tokens.size(0);
  int32_t const num_blocks_per_batch =
      round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
  int32_t const num_blocks = num_blocks_per_batch * B;
  int64_t const sorted_ids_size = num_blocks * block_size;

  TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
  TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
  TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
  TORCH_CHECK(B <= batched_kernel::num_threads);

  batched_kernel::batched_moe_align_block_size_kernel<<<
      batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
      B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
      sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
      num_tokens_post_pad.data_ptr<int32_t>());
}
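For clarity, here is a small NumPy sketch of the outputs this op is expected to produce under the semantics above. The function and variable names are illustrative only and are not part of vLLM's API; it is a readability aid, not the CUDA implementation.

```python
# Reference sketch (assumption: int32 tensors, semantics as in the kernel above).
import numpy as np

def batched_moe_align_block_size_ref(max_tokens_per_batch, block_size, batch_num_tokens):
    B = len(batch_num_tokens)
    blocks_per_batch = -(-max_tokens_per_batch // block_size)  # CEILDIV
    sorted_ids_size = blocks_per_batch * B * block_size
    sentinel = B * max_tokens_per_batch                        # marks padding slots
    sorted_ids = np.full(sorted_ids_size, sentinel, dtype=np.int32)
    block_ids = np.full(sorted_ids_size // block_size, -1, dtype=np.int32)

    cumsum = 0
    for b, n in enumerate(batch_num_tokens):
        padded = -(-n // block_size) * block_size              # round n up to block_size
        sorted_ids[cumsum:cumsum + n] = b * max_tokens_per_batch + np.arange(n)
        block_ids[cumsum // block_size:(cumsum + padded) // block_size] = b
        cumsum += padded
    return sorted_ids, block_ids, cumsum                       # cumsum == num_tokens_post_pad

# Example: two batches with 3 and 5 tokens, block_size=4, max_tokens_per_batch=8.
print(batched_moe_align_block_size_ref(8, 4, [3, 5]))
```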

void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
             torch::Tensor& output)  // [num_tokens, hidden_size]
{

@@ -4,7 +4,7 @@

void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
                  torch::Tensor& gating_output);
                  torch::Tensor& gating_output, bool renormalize);

void moe_sum(torch::Tensor& input, torch::Tensor& output);

@@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
                                  torch::Tensor const& expert_num_tokens,
                                  torch::Tensor sorted_ids,
                                  torch::Tensor expert_ids,
                                  torch::Tensor num_tokens_post_pad);

#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,

@@ -16,12 +16,23 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <type_traits>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#include "../cub_helpers.h"

#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat162 __nv_bfloat162;
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -36,16 +47,27 @@ template <
    /// Alignment requirement in bytes
    int Alignment = sizeof(T) * N
>
class alignas(Alignment) AlignedArray {
    float data[N];
struct alignas(Alignment) AlignedArray {
    T data[N];
};

template <typename T>
__device__ __forceinline__ float toFloat(T value) {
    if constexpr (std::is_same_v<T, float>) {
        return value;
    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
        return __bfloat162float(value);
    } else if constexpr (std::is_same_v<T, __half>) {
        return __half2float(value);
    }
}

// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB>
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
    void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
    void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
{
    using BlockReduce = cub::BlockReduce<float, TPB>;
    __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
        threadData = max(static_cast<float>(input[idx]), threadData);
        const float val = toFloat(input[idx]);
        threadData = max(val, threadData);
    }

    const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
        threadData += exp((static_cast<float>(input[idx]) - float_max));
        const float val = toFloat(input[idx]);
        threadData += expf(val - float_max);
    }

    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
        const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
        output[idx] = val;
        const float val = toFloat(input[idx]);
        const float softmax_val = expf(val - float_max) * normalizing_factor;
        output[idx] = softmax_val;
    }
}

@@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
    const int num_experts,
    const int k,
    const int start_expert,
    const int end_expert)
    const int end_expert,
    const bool renormalize)
{

    using cub_kvp = cub::KeyValuePair<int, float>;
@@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(

    const bool row_is_active = finished ? !finished[block_row] : true;
    const int thread_read_offset = blockIdx.x * num_experts;
    float selected_sum = 0.f;
    for (int k_idx = 0; k_idx < k; ++k_idx)
    {
        thread_kvp.key = 0;
@@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
            indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
            assert(indices[idx] >= 0);
            source_rows[idx] = k_idx * num_rows + block_row;
            if (renormalize) {
                selected_sum += result_kvp.value;
            }
        }
        __syncthreads();
    }

    // Renormalize the k weights for this row to sum to 1, if requested.
    if (renormalize) {
        if (threadIdx.x == 0) {
            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
            for (int k_idx = 0; k_idx < k; ++k_idx) {
                const int idx = k * block_row + k_idx;
                output[idx] = output[idx] / denom;
            }
        }
    }
}

// ====================== TopK softmax things ===============================
@@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
    2) This implementation assumes k is small, but will work for any k.
*/

template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
        int* source_rows, const int k, const int start_expert, const int end_expert)
    void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
{
    static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
                      std::is_same_v<InputType, __half>,
                  "InputType must be float, __nv_bfloat16, or __half");

    // We begin by enforcing compile time assertions and setting up compile time constants.
    static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
    static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");

    // Number of bytes each thread pulls in per load
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
    static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
    static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
    static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;

    if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
        static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
                      "ELTS_PER_LDG must be 1 or even for 16-bit conversion");
    }

    // Restrictions based on previous section.
    static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
    static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__

    // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
    // row it will read.
    const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
    const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;

    // Now, we compute the group each thread belongs to in order to determine the first column to start loads.
    const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
    const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
    const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

    // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
    // this can support all powers of 2 up to 16.
    // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
    // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
    using AccessType = AlignedArray<float, ELTS_PER_LDG>;
    const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

    // Finally, we pull in the data from global mem
    float row_chunk[VPT];
    AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
    const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);

    // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
    if constexpr (std::is_same_v<InputType, float>) {
        using VecType = AlignedArray<float, ELTS_PER_LDG>;
        VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
        const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
    for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
    {
        row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
        for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
            row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
        }
    } else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
        if constexpr (ELTS_PER_LDG >= 2) {
            using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
                    row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
                        *reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
                    );
                }
            }
        } else { // ELTS_PER_LDG == 1
#pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
                row_chunk[ii] = __bfloat162float(*scalar_ptr);
            }
        }
    } else if constexpr (std::is_same_v<InputType, __half>) {
        if constexpr (ELTS_PER_LDG >= 2) {
            using VecType = AlignedArray<__half, ELTS_PER_LDG>;
            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
                    row_chunk_f2[base_idx_f2 + jj] = __half22float2(
                        *reinterpret_cast<const __half2*>(vec.data + jj * 2)
                    );
                }
            }
        } else { // ELTS_PER_LDG == 1
#pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
                row_chunk[ii] = __half2float(*scalar_ptr);
            }
        }
    }

    // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
    int start_col = first_elt_read_by_thread;
    static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

    float selected_sum = 0.f;
    for (int k_idx = 0; k_idx < k; ++k_idx)
    {
        // First, each thread does the local argmax
@@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
            output[idx] = max_val;
            indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
            source_rows[idx] = k_idx * num_rows + thread_row;
            if (renormalize) {
                selected_sum += max_val;
            }
        }

        // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
            }
        }
    }

    // Renormalize the k weights for this row to sum to 1, if requested.
    if (renormalize) {
        if (thread_group_idx == 0)
        {
            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
            for (int k_idx = 0; k_idx < k; ++k_idx)
            {
                const int idx = k * thread_row + k_idx;
                output[idx] = output[idx] / denom;
            }
        }
    }
}

namespace detail
{
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstants
{
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
    static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@@ -397,20 +508,21 @@ struct TopkConstants
};
} // namespace detail

template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
    cudaStream_t stream)
{
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
    const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
}

#ifndef USE_ROCM
@@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
    static_assert(WARP_SIZE == 32, \
                  "Unsupported warp size. Only 32 is supported for CUDA"); \
    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
        gating_output, nullptr, topk_weights, topk_indices, \
        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
        num_tokens, topk, 0, num_experts, renormalize, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
    if (WARP_SIZE == 64) { \
        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
            gating_output, nullptr, topk_weights, topk_indices, \
            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
            num_tokens, topk, 0, num_experts, renormalize, stream); \
    } else if (WARP_SIZE == 32) { \
        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
            gating_output, nullptr, topk_weights, topk_indices, \
            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
            num_tokens, topk, 0, num_experts, renormalize, stream); \
    } else { \
        assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
    }
#endif

template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    const InputType* gating_output,
    float* topk_weights,
    IndType* topk_indices,
    int* token_expert_indices,
@@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
    const int num_tokens,
    const int num_experts,
    const int topk,
    const bool renormalize,
    cudaStream_t stream) {
  static constexpr int WARPS_PER_TB = 4;
  static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
  static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
  // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
  // elements can be loaded by a warp
  static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
      (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
  switch (num_experts) {
    case 1:
@@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
      TORCH_CHECK(softmax_workspace != nullptr,
                  "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
      static constexpr int TPB = 256;
      moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
      moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
          gating_output, nullptr, softmax_workspace, num_experts);
      moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
          softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
          num_experts, topk, 0, num_experts);
          num_experts, topk, 0, num_experts, renormalize);
    }
  }
}
@@ -518,11 +634,50 @@
} // namespace moe
} // namespace vllm


template<typename ComputeType>
void dispatch_topk_softmax_launch(
    torch::Tensor& gating_output,
    torch::Tensor& topk_weights,
    torch::Tensor& topk_indices,
    torch::Tensor& token_expert_indices,
    torch::Tensor& softmax_workspace,
    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
  if (topk_indices.scalar_type() == at::ScalarType::Int) {
    vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
        topk_weights.data_ptr<float>(),
        topk_indices.data_ptr<int>(),
        token_expert_indices.data_ptr<int>(),
        softmax_workspace.data_ptr<float>(),
        num_tokens, num_experts, topk, renormalize, stream);
  } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
    vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
        topk_weights.data_ptr<float>(),
        topk_indices.data_ptr<uint32_t>(),
        token_expert_indices.data_ptr<int>(),
        softmax_workspace.data_ptr<float>(),
        num_tokens, num_experts, topk, renormalize, stream);
  } else {
    TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
    vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
        reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
        topk_weights.data_ptr<float>(),
        topk_indices.data_ptr<int64_t>(),
        token_expert_indices.data_ptr<int>(),
        softmax_workspace.data_ptr<float>(),
        num_tokens, num_experts, topk, renormalize, stream);
  }
}

void topk_softmax(
    torch::Tensor& topk_weights,          // [num_tokens, topk]
    torch::Tensor& topk_indices,          // [num_tokens, topk]
    torch::Tensor& token_expert_indices,  // [num_tokens, topk]
    torch::Tensor& gating_output)         // [num_tokens, num_experts]
    torch::Tensor& gating_output,         // [num_tokens, num_experts]
    bool renormalize)
{
    const int num_experts = gating_output.size(-1);
    const auto num_tokens = gating_output.numel() / num_experts;
@@ -534,45 +689,19 @@ void topk_softmax(

    const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

    if(topk_indices.scalar_type() == at::ScalarType::Int)
    {
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    }
    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
    {
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<uint32_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    }
    else {
        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int64_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    if (gating_output.scalar_type() == at::ScalarType::Float) {
        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else {
        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
    }
}

@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Apply topk softmax to the gating outputs.
  m.def(
      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
      "token_expert_indices, Tensor gating_output) -> ()");
      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

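For reference, a hedged usage sketch of the updated op from Python. It assumes the MoE kernels are built into vLLM's usual `_moe_C` extension; if your build registers the op under a different namespace, adjust accordingly.

```python
# Sketch only: exercises the new `renormalize` argument of topk_softmax.
import torch
import vllm._moe_C  # noqa: F401  (assumed module name; importing registers the custom ops)

num_tokens, num_experts, topk = 4, 8, 2
gating_output = torch.randn(num_tokens, num_experts, dtype=torch.bfloat16, device="cuda")

topk_weights = torch.empty(num_tokens, topk, dtype=torch.float32, device="cuda")
topk_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")
token_expert_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")

# The trailing `True` asks the kernel to renormalize the selected top-k weights to sum to 1.
torch.ops._moe_C.topk_softmax(
    topk_weights, topk_indices, token_expert_indices, gating_output, True
)
```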
  // Calculate the result of moe by summing up the partial results
@@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size, but for the batched case.
  m.def(
      "batched_moe_align_block_size(int max_tokens_per_batch,"
      " int block_size, Tensor expert_num_tokens,"
      " Tensor! sorted_token_ids,"
      " Tensor! experts_ids,"
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("batched_moe_align_block_size", torch::kCUDA,
         &batched_moe_align_block_size);

#ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

@@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
               torch::Tensor& bias, double epsilon);

void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,

@@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
  if (scale_ub.has_value()) {
    TORCH_CHECK(out.dtype() == kFp8Type);
  }
  TORCH_CHECK(weight.dtype() == input.dtype());
  TORCH_CHECK(scales.dtype() == torch::kFloat32);
  if (residual) {
    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
  }

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

@@ -175,12 +175,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

  // Polynomial Normalization.
  ops.def(
      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
      "epsilon) -> ()");
  ops.impl("poly_norm", torch::kCUDA, &poly_norm);

  // Apply repetition penalties to logits in-place
  ops.def(
      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "

@@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-cubin==0.4.0 \
    && uv pip install --system flashinfer-jit-cache==0.4.0 \
    uv pip install --system flashinfer-cubin==0.4.1 \
    && uv pip install --system flashinfer-jit-cache==0.4.1 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
    && flashinfer show-config


@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


# build flashinfer for torch nightly from source around 10 mins
# release version: v0.4.0
# release version: v0.4.1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    echo "git clone flashinfer..." \
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
    && git checkout v0.4.0 \
    && git checkout v0.4.1 \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \

@@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \
    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
    apt-transport-https ca-certificates wget curl
# Remove sccache
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR

@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][]

[](){ #offline-inference-api }

## Offline Inference

LLM Class.
@@ -45,18 +43,14 @@ Engine classes for offline and online inference.

Inference parameters for vLLM APIs.

[](){ #sampling-params }

- [vllm.SamplingParams][]
- [vllm.PoolingParams][]

[](){ #multi-modality }

## Multi-Modality

vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.

Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
via the `multi_modal_data` field in [vllm.inputs.PromptType][].

Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).

@@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.

There are three main levels of configuration, from highest priority to lowest priority:

- [Request parameters][completions-api] and [input arguments][sampling-params]
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
- [Engine arguments](./engine_args.md)
- [Environment variables](./env_vars.md)

@@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
!!! note
    With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).

    You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
    You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.

## Quantization

@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc

```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
from vllm.config import CompilationConfig, CompilationMode

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        mode=CompilationMode.VLLM_COMPILE,
        # By default, it goes up to max_num_seqs
        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
    ),

@@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp

In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.

[](){ #chunked-prefill }

## Chunked Prefill

Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
@@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u

Known supported models (with corresponding benchmarks):

- dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<gh-pr:22697>)
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)

## Input Processing

@@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ

### Tune your workloads

Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.

### Future Topics We'll Cover


@@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:

## License

See <gh-file:LICENSE>.
See [LICENSE](../../LICENSE).

## Developing

@@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.

!!! tip
    vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
    vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.

    Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.

@@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.

### Documentation

MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).

Get started with:

@@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.

!!! important
    If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
    If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).

## Pull Requests & Code Reviews

@@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.

### DCO and Signed-off-by

When contributing changes to this project, you must agree to the <gh-file:DCO>.
When contributing changes to this project, you must agree to the [DCO](../../DCO).
Commits must include a `Signed-off-by:` header which certifies agreement with
the terms of the DCO.


@@ -6,9 +6,10 @@ toc_depth: 4

vLLM provides comprehensive benchmarking tools for performance testing and evaluation:

- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
- **[Batch Scripts](#batch-scripts)**: Run `vllm bench` against multiple configurations conveniently
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives

[Benchmark CLI]: #benchmark-cli

@@ -29,7 +30,7 @@ th {
| Dataset | Online | Offline | Data Path |
|---------|--------|---------|-----------|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
@@ -714,7 +715,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis

Notes:

- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.

Start the server (example):
@@ -822,7 +823,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
- CLIP: `--backend openai-embeddings-clip`
- VLM2Vec: `--backend openai-embeddings-vlm2vec`

For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.

You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
@@ -924,7 +925,146 @@ throughput numbers correctly is also adjusted.

</details>

[](){ #performance-benchmarks }
## Batch Scripts

### Batch Serving Script

[`vllm/benchmarks/serve_multi.py`](../../vllm/benchmarks/serve_multi.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations.

#### Batch Mode

The basic purpose of this script is to evaluate vLLM under different settings. Follow these steps to run the script:

1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.

    - Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:

        ```json
        [
            {
                "max_num_seqs": 32,
                "max_num_batched_tokens": 1024
            },
            {
                "max_num_seqs": 64,
                "max_num_batched_tokens": 1024
            },
            {
                "max_num_seqs": 64,
                "max_num_batched_tokens": 2048
            },
            {
                "max_num_seqs": 128,
                "max_num_batched_tokens": 2048
            },
            {
                "max_num_seqs": 128,
                "max_num_batched_tokens": 4096
            },
            {
                "max_num_seqs": 256,
                "max_num_batched_tokens": 4096
            }
        ]
        ```

4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.

    - Example: Using different input/output lengths for random dataset:

        ```json
        [
            {
                "random_input_len": 128,
                "random_output_len": 32
            },
            {
                "random_input_len": 256,
                "random_output_len": 64
            },
            {
                "random_input_len": 512,
                "random_output_len": 128
            }
        ]
        ```

5. Determine where you want to save the results, and pass that to `--output-dir`.

Example command:

```bash
python vllm/benchmarks/serve_multi.py \
    --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
    --serve-params benchmarks/serve_hparams.json \
    --bench-params benchmarks/bench_hparams.json \
    -o benchmarks/results
```

!!! important
    If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
    You can use `--dry-run` to preview the commands to be run.

    We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
    Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
    In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.

!!! note
    By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.

!!! tip
    You can use the `--resume` option to continue the parameter sweep if one of the runs failed.

#### SLA Mode

By passing SLA constraints via `--sla-params`, you can run this script in SLA mode, causing it to adjust either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints.

For example, to ensure E2E latency within different target values for 99% of requests:

```json
[
    {
        "p99_e2el_ms": "<=200"
    },
    {
        "p99_e2el_ms": "<=500"
    },
    {
        "p99_e2el_ms": "<=1000"
    },
    {
        "p99_e2el_ms": "<=2000"
    }
]
```

Example command:

```bash
python vllm/benchmarks/serve_multi.py \
    --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
    --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
    --serve-params benchmarks/serve_hparams.json \
    --bench-params benchmarks/bench_hparams.json \
    --sla-params benchmarks/sla_hparams.json \
    --sla-variable max_concurrency \
    -o benchmarks/results
```

The algorithm for adjusting the SLA variable is as follows (a small illustrative sketch is shown after the list):

1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
    - For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
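The sketch below illustrates this grow-then-binary-search loop in plain Python. The helper names (`run_benchmark`, `satisfies_sla`) are hypothetical stand-ins; see `vllm/benchmarks/serve_multi.py` for the actual implementation.

```python
# Illustrative only: find the largest SLA-variable value that still meets the SLA,
# assuming run_benchmark(value) returns metrics and satisfies_sla(metrics) checks them.
def find_max_sla_value(run_benchmark, satisfies_sla, initial_value: int) -> int:
    # Phase 1: keep doubling until the SLA breaks, which brackets the answer in (lo, hi].
    lo, hi = 0, initial_value
    while satisfies_sla(run_benchmark(hi)):
        lo, hi = hi, hi * 2
    # Phase 2: binary search inside the bracket for the largest passing value.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if satisfies_sla(run_benchmark(mid)):
            lo = mid
        else:
            hi = mid
    return lo
```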

!!! important
    SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.

    For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.

## Performance Benchmarks

@@ -962,7 +1102,7 @@ For more results visualization, check the [visualizing the results](https://gith

The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).

### Continuous Benchmarking

@@ -988,12 +1128,10 @@ The benchmarking currently runs on a predefined set of models configured in the

All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

[](){ #nightly-benchmarks }

## Nightly Benchmarks

These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.

The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).

More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).

@@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.

Strip timestamps and colorization:

<gh-file:.buildkite/scripts/ci-clean-log.sh>
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)

```bash
./ci-clean-log.sh ci.log
@@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy

CI test failures may be flaky. Use a bash loop to run repeatedly:

<gh-file:.buildkite/scripts/rerun-test.sh>
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)

```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]

@@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them.

## Test PyTorch release candidates (RCs)
@@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.

While ongoing efforts like [#17419](gh-issue:17419)
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
@@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<gh-pr:17444> completed the update for XPU.
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.

@@ -1,6 +1,6 @@
# Dockerfile

We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](../../deployment/docker.md).

Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

@@ -1,7 +1,7 @@
# Summary

!!! important
    Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
    Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!

vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.

@@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code

First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.

!!! warning
@@ -83,7 +83,7 @@ def forward(
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.

For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.

## 3. (Optional) Implement tensor parallelism and quantization support

@@ -130,22 +130,22 @@ We consider 3 different scenarios:
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.

For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
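
To make this concrete, here is a minimal sketch of the case (1) skeleton. The import paths, config attribute names, and return conventions below are simplified assumptions; `mamba.py` and `mamba2.py` remain the authoritative references.

```python
# Minimal sketch of a case (1) model; signatures and config attributes are
# simplified assumptions, not the exact vLLM interfaces.
from torch import nn


class MyMambaForCausalLM(nn.Module):  # in vLLM this would also inherit IsAttentionFree
    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config):
        # Dtypes for the (conv state, SSM state) pair, usually the model dtype.
        dtype = vllm_config.model_config.dtype
        return (dtype, dtype)

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config):
        # Per-layer shapes of the (conv state, SSM state) pair, derived from the config.
        hf_config = vllm_config.model_config.hf_config
        conv_state = (hf_config.intermediate_size, hf_config.conv_kernel - 1)
        ssm_state = (hf_config.intermediate_size, hf_config.state_size)
        return (conv_state, ssm_state)
```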

For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).

For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like the KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention metadata" class which handles the metadata that is common across all layers.
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs work as intended.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
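
The pieces above fit together roughly as follows (a sketch with assumed names and signatures; the real `MambaBase` interface and the `direct_register_custom_op` call sites in vLLM are the source of truth):

```python
# Rough sketch of a custom "mamba-like" layer and its custom-op wrapper.
import torch


class MyShortConvLayer:  # in vLLM this would inherit from MambaBase
    mamba_type = "short_conv"  # assumed identifier

    def __init__(self, hidden_size: int, kernel_size: int):
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        return (torch.bfloat16,)

    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
        # A single in-place conv state per layer (assumed layout).
        return ((self.hidden_size, self.kernel_size - 1),)

    def get_attn_backend(self):
        # Return the backend whose metadata class is shared across these layers.
        raise NotImplementedError


# For torch.compile and CUDA graph support, the layer's forward call is wrapped in
# a custom op (registered with direct_register_custom_op) whose name is then added
# to _attention_ops in vllm/config/compilation.py.
def my_short_conv_forward(hidden_states: torch.Tensor, layer_name: str) -> torch.Tensor:
    # Look up the layer by name from the forward context and run its kernel,
    # updating its state in place.
    raise NotImplementedError
```
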
@ -507,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
```
|
||||
|
||||
!!! note
|
||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
|
||||
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
|
||||
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||
|
||||
=== "With postprocessing: Fuyu"
|
||||
@ -569,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
```
|
||||
|
||||
!!! note
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
|
||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||
|
||||
!!! note
|
||||
@ -828,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
|
||||
|
||||
Examples:
|
||||
|
||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
|
||||
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||
|
||||
### Handling prompt updates unrelated to multi-modal data
|
||||
|
||||
@ -837,9 +837,9 @@ Examples:
|
||||
|
||||
Examples:
|
||||
|
||||
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
||||
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
||||
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
|
||||
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
|
||||
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
||||
|
||||
### Custom HF processor
|
||||
|
||||
@ -847,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
|
||||
|
||||
Examples:
|
||||
|
||||
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
||||
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
||||
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
||||
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
|
||||
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
|
||||
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
|
||||
|
@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models
|
||||
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
|
||||
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
|
||||
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
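
For illustration, the `_VLLM_MODELS` registration step above might look like this (hypothetical model name; the exact dictionary layout in `registry.py` can differ, but entries map an architecture name to a module/class pair):

```python
# Hypothetical entry for vllm/model_executor/models/registry.py; the key must match
# the architecture name in the model's HF config.
_TEXT_GENERATION_MODELS = {
    # ... existing entries ...
    "MyNewModelForCausalLM": ("my_new_model", "MyNewModelForCausalLM"),
}
```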
|
||||
|
||||
!!! important
|
||||
|
@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
|
||||
|
||||
### Model loading
|
||||
|
||||
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
|
||||
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
|
||||
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
||||
|
||||
!!! important
|
||||
@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct
|
||||
|
||||
### Model correctness
|
||||
|
||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
|
||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
|
||||
|
||||
#### Generative models
|
||||
|
||||
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
|
||||
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
|
||||
|
||||
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa (see the sketch below).
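
The following toy function illustrates the logprobs-similarity criterion; it is not the actual `check_logprobs_close` helper, just the idea behind it:

```python
# Each implementation's sampled token must appear in the other implementation's
# top-k logprobs at every position.
def logprobs_close(vllm_tokens, vllm_topk, hf_tokens, hf_topk) -> bool:
    """`*_topk` are per-position dicts mapping token_id -> logprob."""
    for pos, (v_tok, h_tok) in enumerate(zip(vllm_tokens, hf_tokens)):
        if v_tok not in hf_topk[pos] or h_tok not in vllm_topk[pos]:
            return False
    return True
```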
|
||||
|
||||
#### Pooling models
|
||||
|
||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
|
||||
|
||||
[](){ #mm-processing-tests }
|
||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||
|
||||
### Multi-modal processing
|
||||
|
||||
#### Common tests
|
||||
|
||||
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
|
||||
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
|
||||
|
||||
- Text + multi-modal data
|
||||
- Tokens + multi-modal data
|
||||
@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
|
||||
|
||||
#### Model-specific tests
|
||||
|
||||
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
|
||||
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
|
||||
|
||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
|
||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
|
||||
|
@ -248,9 +248,9 @@ No extra registration is required beyond having your model class available via t
|
||||
|
||||
## Examples in-tree
|
||||
|
||||
- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
|
||||
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
|
||||
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
|
||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||
|
||||
## Test with the API
|
||||
|
||||
@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
|
||||
http://localhost:8000/v1/audio/translations
|
||||
```
|
||||
|
||||
Or check out more examples in <gh-file:examples/online_serving>.
|
||||
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
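
For a quick manual check you can also hit the endpoint from Python with the `openai` client (the model name and audio file below are placeholders for whatever you are serving):

```python
# Minimal smoke test of the /v1/audio/transcriptions route.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # placeholder model name
        file=audio,
    )
print(transcription.text)
```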
|
||||
|
||||
!!! note
|
||||
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
||||
|
@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
|
||||
|
||||
#### Offline Inference
|
||||
|
||||
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
|
||||
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
|
||||
|
||||
#### OpenAI Server
|
||||
|
||||
@ -180,9 +180,13 @@ The profiling traces generated by the continuous profiling workflow are publicly
|
||||
The Python standard library includes
|
||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
|
||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
||||
Both the `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` functions can be
|
||||
used to profile a section of code.
|
||||
|
||||
!!! note
|
||||
The legacy import paths `vllm.utils.cprofile` and `vllm.utils.cprofile_context` are deprecated.
|
||||
Please use `vllm.utils.profiling.cprofile` and `vllm.utils.profiling.cprofile_context` instead.
|
||||
|
||||
### Example usage - decorator
|
||||
|
||||
The first helper is a Python decorator that can be used to profile a function.
|
||||
@ -190,9 +194,9 @@ If a filename is specified, the profile will be saved to that file. If no filena
|
||||
specified, profile data will be printed to stdout.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
from vllm.utils.profiling import cprofile
|
||||
|
||||
@vllm.utils.cprofile("expensive_function.prof")
|
||||
@cprofile("expensive_function.prof")
|
||||
def expensive_function():
|
||||
# some expensive code
|
||||
pass
|
||||
@ -204,13 +208,13 @@ The second helper is a context manager that can be used to profile a block of
|
||||
code. Similar to the decorator, the filename is optional.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
from vllm.utils.profiling import cprofile_context
|
||||
|
||||
def another_function():
|
||||
# more expensive code
|
||||
pass
|
||||
|
||||
with vllm.utils.cprofile_context("another_function.prof"):
|
||||
with cprofile_context("another_function.prof"):
|
||||
another_function()
|
||||
```
|
||||
|
||||
|
@ -1,7 +1,5 @@
|
||||
# Using Docker
|
||||
|
||||
[](){ #deployment-docker-pre-built-image }
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
||||
vLLM offers an official Docker image for deployment.
|
||||
@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
|
||||
```bash
|
||||
docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env "HF_TOKEN=$HF_TOKEN" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:
|
||||
```bash
|
||||
podman run --device nvidia.com/gpu=all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env "HF_TOKEN=$HF_TOKEN" \
|
||||
-p 8000:8000 \
|
||||
--ipc=host \
|
||||
docker.io/vllm/vllm-openai:latest \
|
||||
@ -37,7 +35,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
|
||||
memory to share data between processes under the hood, particularly for tensor parallel inference.
|
||||
|
||||
!!! note
|
||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
|
||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
|
||||
|
||||
If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
[](){ #deployment-docker-build-image-from-source }
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
|
||||
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
|
||||
|
||||
```bash
|
||||
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
||||
@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:
|
||||
docker run --runtime nvidia --gpus all \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
vllm/vllm-openai <args...>
|
||||
```
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
# Anyscale
|
||||
|
||||
[](){ #deployment-anyscale }
|
||||
|
||||
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
|
||||
|
||||
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
||||
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
|
||||
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
|
||||
|
||||
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
|
||||
|
||||
|
@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
- name: vllm-leader
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
- name: HF_TOKEN
|
||||
value: <your-hf-token>
|
||||
command:
|
||||
- sh
|
||||
@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
- name: HF_TOKEN
|
||||
value: <your-hf-token>
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
|
@ -36,7 +36,7 @@ pip install -U vllm \
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
|
||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
|
||||
|
||||
1. Run the script
|
||||
|
||||
@ -74,7 +74,7 @@ pip install vllm \
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
|
||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
|
||||
|
||||
1. Run the script:
|
||||
|
||||
|
@ -20,7 +20,7 @@ pip install vllm streamlit openai
|
||||
vllm serve Qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
|
||||
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
|
||||
|
||||
1. Start the streamlit web UI and start to chat:
|
||||
|
||||
|
@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
|
@ -2,8 +2,6 @@
|
||||
|
||||
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-build }
|
||||
|
||||
## Build Nginx Container
|
||||
|
||||
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
|
||||
@ -27,8 +25,6 @@ Build the container:
|
||||
docker build . -f Dockerfile.nginx --tag nginx-lb
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-conf }
|
||||
|
||||
## Create Simple Nginx Config file
|
||||
|
||||
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
||||
@ -53,8 +49,6 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve
|
||||
}
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-vllm-container }
|
||||
|
||||
## Build vLLM Container
|
||||
|
||||
```bash
|
||||
@ -73,16 +67,12 @@ docker build \
|
||||
--build-arg https_proxy=$https_proxy
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-docker-network }
|
||||
|
||||
## Create Docker Network
|
||||
|
||||
```bash
|
||||
docker network create vllm_nginx
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-launch-container }
|
||||
|
||||
## Launch vLLM Containers
|
||||
|
||||
Notes:
|
||||
@ -122,8 +112,6 @@ Notes:
|
||||
!!! note
|
||||
If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-launch-nginx }
|
||||
|
||||
## Launch Nginx
|
||||
|
||||
```bash
|
||||
@ -135,8 +123,6 @@ docker run \
|
||||
--name nginx-lb nginx-lb:latest
|
||||
```
|
||||
|
||||
[](){ #nginxloadbalancer-nginx-verify-nginx }
|
||||
|
||||
## Verify That vLLM Servers Are Ready
|
||||
|
||||
```bash
|
||||
|
@ -47,9 +47,9 @@ Here is a sample of `LLM` class usage:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
||||
More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.
|
||||
|
||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||
The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
|
||||
|
||||
### OpenAI-Compatible API Server
|
||||
|
||||
@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
|
||||
vllm serve <model>
|
||||
```
|
||||
|
||||
The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
|
||||
The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
|
||||
|
||||
Sometimes you may see the API server entrypoint used directly instead of via the
|
||||
`vllm` CLI command. For example:
|
||||
@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
|
||||
`python -m vllm.entrypoints.openai.api_server` is deprecated
|
||||
and may become unsupported in a future release.
|
||||
|
||||
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||
That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).
|
||||
|
||||
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
|
||||
|
||||
@ -101,7 +101,7 @@ processing.
|
||||
- **Output Processing**: Processes the outputs generated by the model, decoding the
|
||||
token IDs from a language model into human-readable text.
|
||||
|
||||
The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
|
||||
The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).
|
||||
|
||||
### AsyncLLMEngine
|
||||
|
||||
@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
|
||||
can handle multiple concurrent requests and stream outputs to clients.
|
||||
|
||||
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
|
||||
API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
|
||||
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).
|
||||
|
||||
The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
|
||||
The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).
|
||||
|
||||
## Worker
|
||||
|
||||
|
@ -17,7 +17,7 @@ In this document we will discuss the:
|
||||
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
|
||||
|
||||
!!! note
|
||||
The following contents are mostly based on the last commit of <gh-pr:20059>.
|
||||
The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.
|
||||
|
||||
## Motivation
|
||||
|
||||
@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
|
||||
The goal of this structure is to uniquely identify a (padded) batch with the minimal set of items corresponding to a CUDA Graphs item. It is safe to exclude items like `uniform_query_len` because it is currently a runtime constant for a given setup. For example, it is either `1` for a typical pure-decode batch or `1+num_spec_tokens` for the validation phase of speculative decode.
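
Conceptually, the dispatch key looks something like the sketch below (the actual `BatchDescriptor` definition in vLLM may carry additional details):

```python
# Sketch of the dispatch key: two batches that pad to the same shape map to the
# same descriptor and therefore reuse the same captured CUDA graph.
from typing import NamedTuple


class BatchDescriptor(NamedTuple):
    num_tokens: int               # padded token count for this step
    uniform_decode: bool = False  # True for uniform decode batches


key = BatchDescriptor(num_tokens=256, uniform_decode=False)
```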
|
||||
|
||||
!!! note
|
||||
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<gh-pr:23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
|
||||
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
|
||||
|
||||
### `CudagraphDispatcher`
|
||||
|
||||
@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
|
||||
"""NO CUDA Graphs support"""
|
||||
```
|
||||
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
|
||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||
|
||||
@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
|
||||
import vllm
|
||||
from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
model = vllm.LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype="auto",
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
## Introduction
|
||||
|
||||
FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py)
|
||||
FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py)
|
||||
|
||||
Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
|
||||
|
||||
@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
|
||||
|
||||
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class.
|
||||
|
||||
Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
||||
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
||||
|
||||
The `FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
||||
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
|
||||
|
||||
#### Step 1: Add an All2All manager
|
||||
|
||||
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
|
||||
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).
|
||||
|
||||
#### Step 2: Add a FusedMoEPrepareAndFinalize Type
|
||||
|
||||
@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
|
||||
|
||||
### How To Unit Test
|
||||
|
||||
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py).
|
||||
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
|
||||
|
||||
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
|
||||
compatible, runs some correctness tests.
|
||||
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
|
||||
|
||||
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](gh-file:tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
|
||||
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
|
||||
2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
|
||||
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
|
||||
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](gh-file:tests/kernels/moe/modular_kernel_tools/common.py)
|
||||
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
|
||||
|
||||
Doing this will add the new implementation to the test suite.
|
||||
|
||||
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
|
||||
|
||||
The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
|
||||
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
|
||||
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
|
||||
with incompatible types, the script will error.
|
||||
|
||||
### How To Profile
|
||||
|
||||
Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
|
||||
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
|
||||
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
|
||||
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
|
||||
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
|
@ -6,7 +6,7 @@ When performing an inference with IO Processor plugins, the prompt type is defin
|
||||
|
||||
## Writing an IO Processor Plugin
|
||||
|
||||
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
|
||||
IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:
|
||||
|
||||
```python
|
||||
IOProcessorInput = TypeVar("IOProcessorInput")
|
||||
@ -67,9 +67,9 @@ The `parse_request` method is used for validating the user prompt and converting
|
||||
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
|
||||
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
|
||||
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
|
||||
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
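
A plugin skeleton therefore looks roughly like this (simplified, illustrative signatures; the `IOProcessor` interface above is authoritative):

```python
# Illustrative plugin skeleton; method names follow the interface described above,
# signatures are simplified.
from typing import Any


class MyGeotiffIOProcessor:  # would subclass IOProcessor[MyInput, MyOutput] in vLLM
    def parse_request(self, request: Any):
        # Validate the raw request and convert it into the plugin's input type.
        raise NotImplementedError

    def pre_process(self, prompt, request_id=None, **kwargs):
        # Turn the validated plugin input into regular vLLM prompts.
        raise NotImplementedError

    def post_process(self, model_output, request_id=None, **kwargs):
        # Convert PoolingRequestOutput objects into the plugin's output type.
        raise NotImplementedError

    def output_to_response(self, plugin_output):
        # Wrap the plugin output into an IOProcessorResponse for online serving.
        raise NotImplementedError
```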
|
||||
|
||||
## Using an IO Processor plugin
|
||||
|
||||
|
@ -80,13 +80,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
|
||||
- `vllm:request_decode_time_seconds` - Requests decode time.
|
||||
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
|
||||
|
||||
See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here.
|
||||
See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.
|
||||
|
||||
### Prometheus Client Library
|
||||
|
||||
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
|
||||
Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
|
||||
|
||||
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
|
||||
|
||||
```bash
|
||||
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
|
||||
@ -99,7 +99,7 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
|
||||
|
||||
### Multi-process Mode
|
||||
|
||||
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
|
||||
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
|
||||
|
||||
### Built in Python/Process Metrics
|
||||
|
||||
@ -125,32 +125,32 @@ vLLM instance.
|
||||
|
||||
For background, these are some of the relevant PRs which added the v0 metrics:
|
||||
|
||||
- <gh-pr:1890>
|
||||
- <gh-pr:2316>
|
||||
- <gh-pr:2730>
|
||||
- <gh-pr:4464>
|
||||
- <gh-pr:7279>
|
||||
- <https://github.com/vllm-project/vllm/pull/1890>
|
||||
- <https://github.com/vllm-project/vllm/pull/2316>
|
||||
- <https://github.com/vllm-project/vllm/pull/2730>
|
||||
- <https://github.com/vllm-project/vllm/pull/4464>
|
||||
- <https://github.com/vllm-project/vllm/pull/7279>
|
||||
|
||||
Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
|
||||
Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
|
||||
|
||||
## v1 Design
|
||||
|
||||
### v1 PRs
|
||||
|
||||
For background, here are the relevant v1 PRs relating to the v1
|
||||
metrics issue <gh-issue:10582>:
|
||||
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:
|
||||
|
||||
- <gh-pr:11962>
|
||||
- <gh-pr:11973>
|
||||
- <gh-pr:10907>
|
||||
- <gh-pr:12416>
|
||||
- <gh-pr:12478>
|
||||
- <gh-pr:12516>
|
||||
- <gh-pr:12530>
|
||||
- <gh-pr:12561>
|
||||
- <gh-pr:12579>
|
||||
- <gh-pr:12592>
|
||||
- <gh-pr:12644>
|
||||
- <https://github.com/vllm-project/vllm/pull/11962>
|
||||
- <https://github.com/vllm-project/vllm/pull/11973>
|
||||
- <https://github.com/vllm-project/vllm/pull/10907>
|
||||
- <https://github.com/vllm-project/vllm/pull/12416>
|
||||
- <https://github.com/vllm-project/vllm/pull/12478>
|
||||
- <https://github.com/vllm-project/vllm/pull/12516>
|
||||
- <https://github.com/vllm-project/vllm/pull/12530>
|
||||
- <https://github.com/vllm-project/vllm/pull/12561>
|
||||
- <https://github.com/vllm-project/vllm/pull/12579>
|
||||
- <https://github.com/vllm-project/vllm/pull/12592>
|
||||
- <https://github.com/vllm-project/vllm/pull/12644>
|
||||
|
||||
### Metrics Collection
|
||||
|
||||
@ -369,7 +369,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
|
||||
|
||||
However, `prometheus_client` has
|
||||
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
|
||||
for [unclear reasons](gh-pr:7279#discussion_r1710417152). We
|
||||
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
|
||||
simply use a `Gauge` metric set to 1 and
|
||||
`multiprocess_mode="mostrecent"` instead.
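
In code, the workaround looks roughly like this (label names abbreviated from the example above):

```python
# Info-style metric emulated with a Gauge that is pinned to 1.
from prometheus_client import Gauge

cache_config_info = Gauge(
    "vllm:cache_config_info",
    "Information about the KV cache configuration",
    labelnames=["block_size", "cache_dtype"],  # abbreviated label set
    multiprocess_mode="mostrecent",  # keep only the latest sample per label set
)
cache_config_info.labels(block_size="16", cache_dtype="auto").set(1)
```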
|
||||
|
||||
@ -394,7 +394,7 @@ distinguish between per-adapter counts. This should be revisited.
|
||||
Note that `multiprocess_mode="livemostrecent"` is used - the most
|
||||
recent metric is used, but only from currently running processes.
|
||||
|
||||
This was added in <gh-pr:9477> and there is
|
||||
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
|
||||
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
||||
If we revisit this design and deprecate the old metric, we should reduce
|
||||
the need for a significant deprecation period by making the change in
|
||||
@ -402,7 +402,7 @@ v0 also and asking this project to move to the new metric.
|
||||
|
||||
### Prefix Cache metrics
|
||||
|
||||
The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
|
||||
The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
|
||||
some interesting points which may be relevant to how we approach
|
||||
future metrics.
|
||||
|
||||
@ -439,8 +439,8 @@ suddenly (from their perspective) when it is removed, even if there is
|
||||
an equivalent metric for them to use.
|
||||
|
||||
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
|
||||
[deprecated](gh-pr:2764) (with a comment in the code),
|
||||
[removed](gh-pr:12383), and then [noticed by a user](gh-issue:13218).
|
||||
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
|
||||
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).
|
||||
|
||||
In general:
|
||||
|
||||
@ -460,20 +460,20 @@ the project-wide deprecation policy.
|
||||
|
||||
### Unimplemented - `vllm:tokens_total`
|
||||
|
||||
Added by <gh-pr:4464>, but apparently never implemented. This can just be
|
||||
Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
|
||||
removed.
|
||||
|
||||
### Duplicated - Queue Time
|
||||
|
||||
The `vllm:time_in_queue_requests` Histogram metric was added by
|
||||
<gh-pr:9659> and its calculation is:
|
||||
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
|
||||
|
||||
```python
|
||||
self.metrics.first_scheduled_time = now
|
||||
self.metrics.time_in_queue = now - self.metrics.arrival_time
|
||||
```
|
||||
|
||||
Two weeks later, <gh-pr:4464> added `vllm:request_queue_time_seconds` leaving
|
||||
Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
|
||||
us with:
|
||||
|
||||
```python
|
||||
@ -513,7 +513,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
|
||||
memory. This is also known as "KV cache offloading" and is configured
|
||||
with `--swap-space` and `--preemption-mode`.
|
||||
|
||||
In v0, [vLLM has long supported beam search](gh-issue:6226). The
|
||||
In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||
SequenceGroup encapsulated the idea of N Sequences which
|
||||
all shared the same prompt kv blocks. This enabled KV cache block
|
||||
sharing between requests, and copy-on-write to do branching. CPU
|
||||
@ -526,7 +526,7 @@ and the part of the prompt that was evicted can be recomputed.
|
||||
|
||||
SequenceGroup was removed in V1, although a replacement will be
|
||||
required for "parallel sampling" (`n>1`).
|
||||
[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a
|
||||
[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
|
||||
lot of complex code for a very uncommon feature.
|
||||
|
||||
In V1, with prefix caching being better (zero overhead) and therefore
|
||||
@ -541,7 +541,7 @@ Some v0 metrics are only relevant in the context of "parallel
|
||||
sampling". This is where the `n` parameter in a request is used to
|
||||
request multiple completions from the same prompt.
|
||||
|
||||
As part of adding parallel sampling support in <gh-pr:10980>, we should
|
||||
As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
|
||||
also add these metrics.
|
||||
|
||||
- `vllm:request_params_n` (Histogram)
|
||||
@ -566,7 +566,7 @@ model and then validate those tokens with the larger model.
|
||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||
|
||||
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
|
||||
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
|
||||
speculative decoding to v1. Other techniques will follow. We should
|
||||
revisit the v0 metrics in this context.
|
||||
|
||||
@ -587,7 +587,7 @@ see:
|
||||
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
|
||||
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
|
||||
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
|
||||
- <gh-issue:5041> and <gh-pr:12726>.
|
||||
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.
|
||||
|
||||
This is a non-trivial topic. Consider this comment from Rob:
|
||||
|
||||
@ -654,7 +654,7 @@ fall under the more general heading of "Observability".
|
||||
|
||||
v0 has support for OpenTelemetry tracing:
|
||||
|
||||
- Added by <gh-pr:4687>
|
||||
- Added by <https://github.com/vllm-project/vllm/pull/4687>
|
||||
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
|
||||
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||
- [User-facing docs](../examples/online_serving/opentelemetry.md)
|
||||
@ -685,7 +685,7 @@ documentation for this option states:
|
||||
> use of possibly costly and or blocking operations and hence might
|
||||
> have a performance impact.
|
||||
|
||||
The metrics were added by <gh-pr:7089> and show up in an OpenTelemetry trace
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
|
||||
as:
|
||||
|
||||
```text
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Multi-Modal Data Processing
|
||||
|
||||
To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||
To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
|
||||
|
||||
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
|
||||
|
||||
@ -41,14 +41,10 @@ While HF processors support text + multi-modal inputs natively, this is not so f
|
||||
|
||||
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
|
||||
|
||||
[](){ #mm-dummy-text }
|
||||
|
||||
### Dummy text
|
||||
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
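
For a hypothetical image-only model, this hook can be as simple as the following sketch (signature simplified from `BaseDummyInputsBuilder`):

```python
# Emit one image placeholder per requested image so the HF processor sees text
# that is consistent with the dummy multi-modal inputs.
from collections.abc import Mapping


def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)
    return "<image>" * num_images
```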
|
||||
|
||||
[](){ #mm-automatic-prompt-updating }
|
||||
|
||||
### Automatic prompt updating
|
||||
|
||||
We address the second issue by implementing model-agnostic code in
|
||||
@ -60,8 +56,8 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
|
||||
|
||||
## Processor Output Caching
|
||||
|
||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||
|
||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
|
||||
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
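
The caching flow described above boils down to something like this (illustrative only; vLLM's actual cache keys and processor invocation differ):

```python
# Process only the multi-modal items missing from the cache, in one batched call,
# then return the outputs merged back into the original order.
def process_with_cache(items, cache, hf_process_batch, cache_key):
    keys = [cache_key(item) for item in items]
    missing = [(k, item) for k, item in zip(keys, items) if k not in cache]
    if missing:
        outputs = hf_process_batch([item for _, item in missing])
        for (k, _), out in zip(missing, outputs):
            cache[k] = out
    return [cache[k] for k in keys]
```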
|
||||
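Schematically, this check-then-merge flow can be sketched as follows; this is a simplified illustration rather than vLLM's actual implementation, and the hashing and batch-processing callables are placeholders.

```python
def process_with_cache(mm_items, cache, process_batch, item_hash):
    """Process only the cache misses in one batch, then merge with cached outputs."""
    missing = [item for item in mm_items if item_hash(item) not in cache]
    if missing:
        # Only the misses go through the (potentially slow) HF processor.
        for item, output in zip(missing, process_batch(missing)):
            cache[item_hash(item)] = output
    # Merge: return one output per input item, preserving the original order.
    return [cache[item_hash(item)] for item in mm_items]
```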
|
@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
||||
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
|
||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
|
||||
|
||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
|
||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
|
||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
## Debugging
|
||||
|
||||
Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
|
||||
Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
|
||||
page for information on known issues and how to solve them.
|
||||
|
||||
## Introduction
|
||||
@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
|
||||
|
||||
Related PRs:
|
||||
|
||||
- <gh-pr:8823>
|
||||
- <https://github.com/vllm-project/vllm/pull/8823>
|
||||
|
||||
## Prior State in v1
|
||||
|
||||
|
@ -41,7 +41,7 @@ Every plugin has three parts:
|
||||
|
||||
1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
|
||||
2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
3. **Plugin value**: The fully qualified name of the function or module to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
|
||||
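Putting the three parts together, a minimal `setup.py` for the example plugin above could look like the following sketch (the package layout and version are illustrative):

```python
from setuptools import setup

setup(
    name="vllm_add_dummy_model",
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        # 1. Plugin group: always vllm.general_plugins for general plugins
        "vllm.general_plugins": [
            # 2. Plugin name = 3. Plugin value ("<module>:<function>")
            "register_dummy_model = vllm_add_dummy_model:register",
        ],
    },
)
```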
## Types of supported plugins
|
||||
|
||||
@ -51,6 +51,8 @@ Every plugin has three parts:
|
||||
|
||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified name of the IOProcessor class.
|
||||
|
||||
- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase.
|
||||
|
||||
## Guidelines for Writing Plugins
|
||||
|
||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||
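For example, a re-entrant entry-point function can guard its registration so repeated calls are harmless; the model architecture and class path below are hypothetical.

```python
def register():
    from vllm import ModelRegistry

    # Check first so calling register() multiple times stays a no-op.
    if "MyDummyModel" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyDummyModel",
            "vllm_add_dummy_model.my_model:MyDummyModelForCausalLM",
        )
```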
|
@ -19,8 +19,8 @@ vLLM will take all the available factors into consideration, and decide a direct
|
||||
|
||||
The factors considered include:
|
||||
|
||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
|
||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
|
||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
|
||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||
|
||||
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
|
||||
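For example, the cache can be disabled for a single run by setting the variable before vLLM is imported (a minimal sketch; the model name is illustrative):

```python
import os

# Disable the compilation cache for this run only.
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"

from vllm import LLM

llm = LLM(model="Qwen/Qwen3-0.6B")
```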
|
@ -36,45 +36,43 @@ th:not(:first-child) {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
|
||||
[](){ #feature-x-hardware }
|
||||
|
||||
### Feature x Hardware
|
||||
|
||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|
||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
|
||||
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
|
||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) |
|
||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|
||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------|
|
||||
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
|
||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
|
||||
|
@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
|
||||
|
||||
Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
|
||||
|
||||
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
|
||||
[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
|
||||
|
||||
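A minimal sketch of enabling APC (the model name is illustrative):

```python
from vllm import LLM

# Enable Automatic Prefix Caching when constructing the engine.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enable_prefix_caching=True,
)

outputs = llm.generate("Explain automatic prefix caching in one sentence.")
print(outputs[0].outputs[0].text)
```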
## Example workloads
|
||||
|
||||
|
@ -17,14 +17,14 @@ Two main reasons:
|
||||
|
||||
## Usage example
|
||||
|
||||
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
|
||||
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
|
||||
|
||||
Now supports 5 types of connectors:
|
||||
|
||||
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
|
||||
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
|
||||
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` field already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
||||
|
||||
```bash
|
||||
@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
|
||||
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
|
||||
|
||||
## Development
|
||||
|
||||
|
@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
|
||||
)
|
||||
```
|
||||
|
||||
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||
Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||
|
||||
## Serving LoRA Adapters
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
# Multimodal Inputs
|
||||
|
||||
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
|
||||
This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
|
||||
|
||||
!!! note
|
||||
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
|
||||
We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
|
||||
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
||||
|
||||
!!! tip
|
||||
@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||
|
||||
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
||||
|
||||
@ -162,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
||||
Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
|
||||
|
||||
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||
|
||||
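A sketch of the image-URL form is shown below; the model name and URL are illustrative, and the full example file linked above covers the other formats.

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cat.jpg"}},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

outputs = llm.chat(messages)
print(outputs[0].outputs[0].text)
```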
@ -346,13 +346,13 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
|
||||
!!! note
|
||||
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
|
||||
|
||||
### Audio Inputs
|
||||
|
||||
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
|
||||
|
||||
Full example: <gh-file:examples/offline_inference/audio_language.py>
|
||||
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
|
||||
|
||||
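A minimal sketch is shown below; the model name and the audio placeholder in the prompt are illustrative and model-dependent, so consult the model card for the exact prompt format.

```python
import numpy as np

from vllm import LLM

llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b")  # assumed audio-capable model

sampling_rate = 16_000
audio = np.zeros(sampling_rate, dtype=np.float32)  # 1 second of dummy audio

outputs = llm.generate({
    "prompt": "<|audio|>\nWhat can you hear in this clip?",
    "multi_modal_data": {"audio": (audio, sampling_rate)},
})
print(outputs[0].outputs[0].text)
```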
### Embedding Inputs
|
||||
|
||||
@ -434,11 +434,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
||||
A chat template is **required** to use Chat Completions API.
|
||||
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
|
||||
|
||||
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
|
||||
If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
|
||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||
|
||||
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
|
||||
For certain models, we provide alternative chat templates inside [examples](../../examples).
|
||||
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
@ -524,7 +524,7 @@ Then, you can use the OpenAI client as follows:
|
||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||
```
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||
|
||||
!!! tip
|
||||
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
|
||||
@ -595,7 +595,7 @@ Then, you can use the OpenAI client as follows:
|
||||
print("Chat completion output from image url:", result)
|
||||
```
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||
|
||||
!!! note
|
||||
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
||||
@ -719,7 +719,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
||||
print("Chat completion output from audio url:", result)
|
||||
```
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
|
||||
|
||||
!!! note
|
||||
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
|
||||
|
@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
|
||||
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
||||
|
||||
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
||||
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
|
||||
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
|
||||
|
||||
For non-CUDA platforms, please install NIXL with a UCX build from source, as instructed below.
|
||||
|
||||
@ -170,6 +170,6 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||
|
@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
|
||||
|
||||
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
|
||||
|
||||
<gh-file:examples/offline_inference/prompt_embed_inference.py>
|
||||
[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
|
||||
|
||||
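A minimal offline sketch, assuming the engine is started with `enable_prompt_embeds=True` and using an illustrative model name:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm import LLM

model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForCausalLM.from_pretrained(model_id)

# Build prompt embeddings with the HF model's input embedding layer.
token_ids = tokenizer("Please tell me about the capital of France.",
                      return_tensors="pt").input_ids
with torch.no_grad():
    prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

llm = LLM(model=model_id, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
print(outputs[0].outputs[0].text)
```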
## Online Serving
|
||||
|
||||
@ -37,4 +37,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
|
||||
|
||||
Then, you can use the OpenAI client as follows:
|
||||
|
||||
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
|
||||
[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)
|
||||
|
@ -64,4 +64,4 @@ th:not(:first-child) {
|
||||
!!! note
|
||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
|
||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
||||
For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.
|
||||
|
@ -1,5 +1,9 @@
|
||||
# AutoAWQ
|
||||
|
||||
> ⚠️ **Warning:**
|
||||
The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
|
||||
For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).
|
||||
|
||||
To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
|
||||
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
|
||||
The main benefits are lower latency and memory usage.
|
||||
@ -18,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
|
||||
from awq import AutoAWQForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
|
||||
quant_path = 'mistral-instruct-v0.2-awq'
|
||||
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
|
||||
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
|
||||
quant_path = "mistral-instruct-v0.2-awq"
|
||||
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
|
||||
|
||||
# Load model
|
||||
model = AutoAWQForCausalLM.from_pretrained(
|
||||
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
|
||||
model_path,
|
||||
low_cpu_mem_usage=True,
|
||||
use_cache=False,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
|
@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from auto_round import AutoRound
|
||||
|
||||
model_name = "Qwen/Qwen3-0.6B"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
bits, group_size, sym = 4, 128, True
|
||||
|
@ -34,7 +34,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas"
|
||||
quantization="bitblas",
|
||||
)
|
||||
```
|
||||
|
||||
@ -53,6 +53,6 @@ llm = LLM(
|
||||
dtype=torch.float16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas",
|
||||
max_model_len=1024
|
||||
max_model_len=1024,
|
||||
)
|
||||
```
|
||||
|
@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True
|
||||
trust_remote_code=True,
|
||||
)
|
||||
```
|
||||
|
||||
@ -43,7 +43,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitsandbytes"
|
||||
quantization="bitsandbytes",
|
||||
)
|
||||
```
|
||||
|
||||
|
@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
|
||||
|
||||
# Configure the simple PTQ quantization
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
||||
targets="Linear",
|
||||
scheme="FP8_DYNAMIC",
|
||||
ignore=["lm_head"],
|
||||
)
|
||||
|
||||
# Apply the quantization algorithm.
|
||||
oneshot(model=model, recipe=recipe)
|
||||
|
@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
conversation = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
"content": "You are a helpful assistant",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello"
|
||||
"content": "Hello",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
"content": "Hello! How can I assist you today?",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.chat(conversation, sampling_params)
|
||||
|
@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4",
|
||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
split="train",
|
||||
).select(range(1024))["text"]
|
||||
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||
|
@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
|
||||
},
|
||||
ignore=["lm_head"],
|
||||
update_size=NUM_CALIBRATION_SAMPLES,
|
||||
dampening_frac=0.01
|
||||
dampening_frac=0.01,
|
||||
)
|
||||
```
|
||||
|
||||
|
@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
|
@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
def main():
|
||||
|
||||
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
|
||||
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
|
||||
|
||||
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
|
||||
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
|
||||
|
@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True)
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True,
|
||||
)
|
||||
prompt = "London is the capital of"
|
||||
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
|
||||
print(out)
|
||||
@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
|
||||
|
||||
# Select model and load it
|
||||
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
|
||||
# Select calibration dataset
|
||||
|
@ -48,7 +48,9 @@ to fetch model and tokenizer.
|
||||
MAX_SEQ_LEN = 512
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
|
||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||
|
||||
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
||||
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
||||
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE, drop_last=True)
|
||||
tokenized_outputs = tokenizer(
|
||||
text_data,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=MAX_SEQ_LEN,
|
||||
)
|
||||
calib_dataloader = DataLoader(
|
||||
tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE,
|
||||
drop_last=True,
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Set the Quantization Configuration
|
||||
@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
load_quant_algo_config_from_file)
|
||||
|
||||
# Define fp8/per-tensor/static spec.
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||
is_dynamic=False).to_quantization_spec()
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
|
||||
observer_method="min_max",
|
||||
is_dynamic=False,
|
||||
).to_quantization_spec()
|
||||
|
||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC)
|
||||
global_quant_config = QuantizationConfig(
|
||||
input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC,
|
||||
)
|
||||
|
||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||
kv_cache_quant_config = {name :
|
||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC)
|
||||
for name in kv_cache_layer_names_for_llama}
|
||||
kv_cache_quant_config = {
|
||||
name: QuantizationConfig(
|
||||
input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC,
|
||||
)
|
||||
for name in kv_cache_layer_names_for_llama
|
||||
}
|
||||
layer_quant_config = kv_cache_quant_config.copy()
|
||||
|
||||
# Define algorithm config by config file.
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
||||
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
|
||||
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
||||
|
||||
EXCLUDE_LAYERS = ["lm_head"]
|
||||
@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
layer_quant_config=layer_quant_config,
|
||||
kv_cache_quant_config=kv_cache_quant_config,
|
||||
exclude=EXCLUDE_LAYERS,
|
||||
algo_config=algo_config)
|
||||
algo_config=algo_config,
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Quantize the Model and Export
|
||||
@ -165,8 +182,11 @@ for more exporting format details.
|
||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||
with torch.no_grad():
|
||||
exporter.export_safetensors_model(freezed_model,
|
||||
quant_config=quant_config, tokenizer=tokenizer)
|
||||
exporter.export_safetensors_model(
|
||||
freezed_model,
|
||||
quant_config=quant_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Evaluation in vLLM
|
||||
@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype='fp8',quantization='quark')
|
||||
llm = LLM(
|
||||
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype="fp8",
|
||||
quantization="quark",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
|
||||
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype="auto",
|
||||
dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
|