Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
Compare commits
491 Commits
debug-logs ... 8a81d776ce
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


 def print_top_10_largest_files(zip_file):
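The hunk above raises the wheel size limit to 500 MiB. For context, a minimal sketch of how such a limit can be enforced against a built wheel; the body of print_top_10_largest_files here is an assumption (only its name and the VLLM_MAX_SIZE_MB line come from the diff), and the wheel path is a placeholder:

```python
import os
import sys
import zipfile

# Default mirrors the diff above: 500 MiB unless overridden via the environment.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def wheel_size_mb(wheel_path: str) -> float:
    """Return the on-disk size of the wheel in MiB."""
    return os.path.getsize(wheel_path) / (1024 * 1024)


def print_top_10_largest_files(wheel_path: str) -> None:
    """List the ten largest members of the wheel to help debug size regressions."""
    with zipfile.ZipFile(wheel_path) as zf:
        members = sorted(zf.infolist(), key=lambda m: m.file_size, reverse=True)
        for member in members[:10]:
            print(f"{member.file_size / (1024 * 1024):8.2f} MiB  {member.filename}")


if __name__ == "__main__":
    path = sys.argv[1]  # e.g. dist/vllm-*.whl (placeholder)
    size = wheel_size_mb(path)
    print(f"{path}: {size:.1f} MiB (limit {VLLM_MAX_SIZE_MB} MiB)")
    if size > VLLM_MAX_SIZE_MB:
        print_top_10_largest_files(path)
        sys.exit(1)
```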
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
@@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
    value: 0.80
limit: 100
num_fewshot: 0
@@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1

model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
limit: 2500
num_fewshot: 0
.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)
@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)
@@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)
@@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9

usage() {
    echo
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m  - huggingface stub or local directory of the model"
    echo "  -l  - limit number of samples to run"
    echo "  -t  - tensor parallel size to run at"
    echo
}

while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
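The same evaluation can also be driven from Python. A hedged sketch of the lm-eval API call that the CLI invocation above corresponds to (argument names per lm-eval 0.4.x; the model and limit values are taken from the Qwen2.5-VL config earlier and are placeholders for whatever -m/-l/-t are passed):

```python
import lm_eval

# Mirrors: lm_eval --model vllm-vlm --model_args ... --tasks chartqa \
#          --batch_size auto --apply_chat_template --limit $LIMIT
results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1",
    tasks=["chartqa"],
    batch_size="auto",
    apply_chat_template=True,  # multimodal models need the chat template applied
    limit=2500,
)
print(results["results"]["chartqa"])
```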
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (0 lines changed, Normal file → Executable file)

@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo
    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m  - huggingface stub or local directory of the model"
    echo "  -l  - limit number of samples to run"
    echo "  -f  - number of fewshot samples to use"
    echo "  -t  - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
    )

    # get markdown tables
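A likely motivation for this change (an inference, not stated in the diff): before Python 3.12, a backslash escape such as '\n' is not allowed inside an f-string expression, while str.splitlines() avoids the escape entirely and also handles '\r\n'. A small illustration:

```python
gpu_field = "NVIDIA H100\nNVIDIA H100\nNVIDIA H100"

# str.splitlines() needs no backslash inside the f-string expression and
# copes with both "\n" and "\r\n" separators.
label = f"{len(gpu_field.splitlines())}x{gpu_field.splitlines()[0]}"
print(label)  # -> "3xNVIDIA H100"

# The old form only parses on Python >= 3.12; on 3.11 and earlier a backslash
# inside an f-string expression is a SyntaxError:
#   f"{len(gpu_field.split('\n'))}x{gpu_field.split('\n')[0]}"
```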
@@ -454,11 +454,6 @@ main() {
   fi
   check_hf_token

-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@ -1,46 +0,0 @@
|
||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||
# following differences:
|
||||
# - ruff line length is overridden to 88
|
||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
"vllm/version.py" = ["F401"]
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
"E",
|
||||
# Pyflakes
|
||||
"F",
|
||||
# pyupgrade
|
||||
"UP",
|
||||
# flake8-bugbear
|
||||
"B",
|
||||
# flake8-simplify
|
||||
"SIM",
|
||||
# isort
|
||||
"I",
|
||||
# flake8-logging-format
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
# f-string format
|
||||
"UP032",
|
||||
# Can remove once 3.10+ is the minimum Python version
|
||||
"UP007",
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = true
|
@ -8,7 +8,7 @@ steps:
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
@ -48,7 +48,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
@ -76,7 +76,7 @@ steps:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
@ -150,11 +150,16 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
- "docker push vllm/vllm-openai:nightly"
|
||||
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
@ -163,3 +168,4 @@ steps:
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
@ -8,20 +8,41 @@ set -ex
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
|
||||
# Get DockerHub token from environment
|
||||
# Get DockerHub credentials from environment
|
||||
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
||||
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "$DOCKERHUB_USERNAME" ]; then
|
||||
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get DockerHub bearer token
|
||||
echo "Getting DockerHub bearer token..."
|
||||
set +x
|
||||
BEARER_TOKEN=$(curl -s -X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
|
||||
"https://hub.docker.com/v2/users/login" | jq -r '.token')
|
||||
set -x
|
||||
|
||||
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
|
||||
echo "Error: Failed to get DockerHub bearer token"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Function to get all tags from DockerHub
|
||||
get_all_tags() {
|
||||
local page=1
|
||||
local all_tags=""
|
||||
|
||||
while true; do
|
||||
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
|
||||
set +x
|
||||
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
|
||||
"$REPO_API_URL?page=$page&page_size=100")
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
@ -43,7 +64,9 @@ delete_tag() {
|
||||
echo "Deleting tag: $tag_name"
|
||||
|
||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
|
||||
set +x
|
||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
||||
set -x
|
||||
|
||||
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
||||
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
||||
|
@ -25,25 +25,28 @@ function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
set -evx
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
|
||||
# Note: disable Bart until supports V1
|
||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
export container_id
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c cpu_tests
|
||||
timeout 120m bash -c cpu_tests
|
||||
|
||||
|
@ -70,7 +70,7 @@ function cpu_tests() {
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Note: disable it until supports V1
|
||||
# Run AWQ test
|
||||
|
@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
# tpu-info
|
||||
|
@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
# tpu-info
|
||||
|
@ -44,6 +44,5 @@ docker run \
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/test_metrics
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
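Once the server started by this script is up, it can be exercised over the OpenAI-compatible API. A hedged sketch, assuming the default port 8000 and using the openai client; the model name is a placeholder for whatever $MODEL the benchmark serves:

```python
from openai import OpenAI

# vllm serve exposes an OpenAI-compatible endpoint, by default on port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder; use the served $MODEL
    prompt="San Francisco is a",
    max_tokens=32,
    temperature=0.0,
)
print(completion.choices[0].text)
```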
.buildkite/test-amd.yaml (new file, 1267 lines)
File diff suppressed because it is too large
@ -296,6 +296,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
@ -317,7 +318,7 @@ steps:
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||
@ -397,12 +398,12 @@ steps:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_functionalization.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
- pytest -v -s compile/test_aot_compile.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -415,8 +416,8 @@ steps:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
|
||||
- label: PyTorch Fullgraph Test # 20min
|
||||
timeout_in_minutes: 30
|
||||
- label: PyTorch Fullgraph Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -424,6 +425,7 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
- pytest -v -s compile/test_fusions_e2e.py
|
||||
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
@ -431,8 +433,9 @@ steps:
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N # 23min
|
||||
timeout_in_minutes: 35
|
||||
@ -476,6 +479,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
- vllm/model_executor/layers/mamba/ops
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
@ -524,8 +528,9 @@ steps:
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
@ -730,6 +735,16 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
@ -793,8 +808,8 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 38 min
|
||||
timeout_in_minutes: 60
|
||||
- label: Blackwell Test # 21 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
@ -807,8 +822,6 @@ steps:
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/fusion.py
|
||||
- vllm/compilation/fusion_attn.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
@ -825,19 +838,38 @@ steps:
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
|
||||
# Fusion
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
|
||||
- label: GPT-OSS Eval (Blackwell)
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true # disable while debugging
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
@ -864,6 +896,16 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@ -978,6 +1020,11 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
@ -1042,6 +1089,17 @@ steps:
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
|
||||
##### multi gpus test #####
|
||||
@ -1074,12 +1132,16 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distrubted Tests (H200) # optional
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
|
||||
|
.coveragerc
@@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
     */tests/*
     */test_*
@@ -12,6 +17,16 @@ omit =
     */benchmarks/*
     */docs/*

+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
     pragma: no cover
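The [paths] section matters when per-run data files are merged. A hedged sketch using the coverage.py API; the data-file naming in the comment is the usual coverage.py convention, not something stated in this diff:

```python
import coverage

# Each CI shard produces a ".coverage.<suffix>" data file, often recording the
# installed location (e.g. .../site-packages/vllm/...).  With the [paths] mapping
# above, combine() folds those locations into the canonical "vllm" source tree.
cov = coverage.Coverage(config_file=".coveragerc")
cov.combine()   # merge .coverage.* data files found in the current directory
cov.save()
cov.report(show_missing=False)
```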
.git-blame-ignore-revs (new file, 4 lines)
@@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f
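For this file to take effect in a local checkout, git has to be pointed at it. A hedged sketch; blame.ignoreRevsFile and --ignore-revs-file are standard git features, and the file path used for the blame is just an example taken from elsewhere in this diff:

```python
import subprocess

# Tell git blame to skip the bulk-reformat commits listed in .git-blame-ignore-revs.
subprocess.run(
    ["git", "config", "blame.ignoreRevsFile", ".git-blame-ignore-revs"],
    check=True,
)

# Equivalent one-off form: pass the file explicitly for a single blame.
subprocess.run(
    ["git", "blame", "--ignore-revs-file", ".git-blame-ignore-revs", "vllm/version.py"],
    check=True,
)
```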
14
.github/CODEOWNERS
vendored
14
.github/CODEOWNERS
vendored
@ -5,9 +5,7 @@
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
@ -23,9 +21,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
@ -59,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
# Transformers backend
|
||||
/vllm/model_executor/models/transformers.py @hmellor
|
||||
/vllm/model_executor/models/transformers @hmellor
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
# Docs
|
||||
@ -120,3 +118,11 @@ mkdocs.yaml @hmellor
|
||||
|
||||
# KVConnector installation files
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
# Pooling models
|
||||
/examples/*/pooling/ @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler.py @noooop
|
||||
|
2
.github/mergify.yml
vendored
2
.github/mergify.yml
vendored
@ -11,6 +11,8 @@ pull_request_rules:
|
||||
label:
|
||||
add:
|
||||
- documentation
|
||||
comment:
|
||||
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
|
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@ -13,6 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Label issues based on keywords
|
||||
id: label-step
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
@ -42,7 +43,6 @@ jobs:
|
||||
searchIn: "body"
|
||||
},
|
||||
],
|
||||
|
||||
// Substring search - matches anywhere in text (partial matches)
|
||||
substrings: [
|
||||
{
|
||||
@ -89,14 +89,12 @@ jobs:
|
||||
term: "hip_",
|
||||
searchIn: "both"
|
||||
},
|
||||
|
||||
// ROCm tools and libraries
|
||||
{
|
||||
term: "hipify",
|
||||
searchIn: "both"
|
||||
},
|
||||
],
|
||||
|
||||
// Regex patterns - for complex pattern matching
|
||||
regexPatterns: [
|
||||
{
|
||||
@ -107,13 +105,17 @@ jobs:
|
||||
}
|
||||
],
|
||||
},
|
||||
// Add more label configurations here as needed
|
||||
// example: {
|
||||
// keywords: [...],
|
||||
// substrings: [...],
|
||||
// regexPatterns: [...]
|
||||
// },
|
||||
};
|
||||
|
||||
// Helper function to create regex based on search type
|
||||
function createSearchRegex(term, type) {
|
||||
// Escape special regex characters in the term
|
||||
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||
|
||||
switch (type) {
|
||||
case 'keyword':
|
||||
// Word boundary search - matches whole words only
|
||||
@ -125,16 +127,13 @@ jobs:
|
||||
throw new Error(`Unknown search type: ${type}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to find matching terms in text with line information
|
||||
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
||||
const matches = [];
|
||||
const lines = text.split('\n');
|
||||
|
||||
for (const termConfig of searchTerms) {
|
||||
let regex;
|
||||
let term, searchIn, pattern, description, flags;
|
||||
|
||||
// Handle different input formats (string or object)
|
||||
if (typeof termConfig === 'string') {
|
||||
term = termConfig;
|
||||
@ -146,21 +145,17 @@ jobs:
|
||||
description = termConfig.description;
|
||||
flags = termConfig.flags;
|
||||
}
|
||||
|
||||
// Skip if this term shouldn't be searched in the current location
|
||||
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create appropriate regex
|
||||
if (searchType === 'regex') {
|
||||
regex = new RegExp(pattern, flags || "gi");
|
||||
} else {
|
||||
regex = createSearchRegex(term, searchType);
|
||||
}
|
||||
|
||||
const termMatches = [];
|
||||
|
||||
// Check each line for matches
|
||||
lines.forEach((line, lineIndex) => {
|
||||
const lineMatches = line.match(regex);
|
||||
@ -175,15 +170,14 @@ jobs:
|
||||
originalTerm: term || pattern,
|
||||
description: description,
|
||||
// Show context around the match in the line
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
: line.trim()
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (termMatches.length > 0) {
|
||||
matches.push({
|
||||
term: term || (description || pattern),
|
||||
@ -196,64 +190,48 @@ jobs:
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
// Helper function to check if label should be added
|
||||
async function processLabel(labelName, config) {
|
||||
const body = context.payload.issue.body || "";
|
||||
const title = context.payload.issue.title || "";
|
||||
|
||||
core.notice(`Processing label: ${labelName}`);
|
||||
core.notice(`Issue Title: "${title}"`);
|
||||
core.notice(`Issue Body length: ${body.length} characters`);
|
||||
|
||||
let shouldAddLabel = false;
|
||||
let allMatches = [];
|
||||
let reason = '';
|
||||
|
||||
const keywords = config.keywords || [];
|
||||
const substrings = config.substrings || [];
|
||||
const regexPatterns = config.regexPatterns || [];
|
||||
|
||||
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
||||
|
||||
// Search in title
|
||||
if (title.trim()) {
|
||||
core.notice(`Searching in title: "${title}"`);
|
||||
|
||||
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
||||
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
||||
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
||||
|
||||
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
||||
}
|
||||
|
||||
// Search in body
|
||||
if (body.trim()) {
|
||||
core.notice(`Searching in body (${body.length} characters)`);
|
||||
|
||||
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
||||
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
||||
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
||||
|
||||
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
||||
}
|
||||
|
||||
if (allMatches.length > 0) {
|
||||
core.notice(`Found ${allMatches.length} matching term(s):`);
|
||||
|
||||
for (const termMatch of allMatches) {
|
||||
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
||||
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
||||
|
||||
if (termMatch.searchType === 'regex') {
|
||||
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
} else {
|
||||
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
}
|
||||
|
||||
// Show details for each match
|
||||
termMatch.matches.forEach((match, index) => {
|
||||
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
||||
@ -266,7 +244,6 @@ jobs:
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
shouldAddLabel = true;
|
||||
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
||||
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
||||
@ -274,13 +251,10 @@ jobs:
|
||||
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
||||
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
||||
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
||||
|
||||
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
||||
}
|
||||
|
||||
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
||||
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
||||
|
||||
if (shouldAddLabel) {
|
||||
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
||||
if (!existingLabels.includes(labelName)) {
|
||||
@ -296,14 +270,92 @@ jobs:
|
||||
core.notice(`Label "${labelName}" already present.`);
|
||||
return false;
|
||||
}
|
||||
|
||||
core.notice(`No matching terms found for label "${labelName}".`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Process all configured labels
|
||||
const processLabels = Object.entries(labelConfig)
|
||||
.map(([labelName, config]) => processLabel(labelName, config));
|
||||
const labelsAdded = await Promise.all(processLabels);
|
||||
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
const labelsAddedResults = await Promise.all(
|
||||
Object.entries(labelConfig).map(([labelName, config]) =>
|
||||
processLabel(labelName, config).then(added => ({ labelName, added }))
|
||||
)
|
||||
);
|
||||
|
||||
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
|
||||
// Return which labels were added for the next step
|
||||
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
|
||||
core.setOutput('labels_added', JSON.stringify(addedLabels));
|
||||
return addedLabels;
|
||||
|
||||
- name: CC users for labeled issues
|
||||
if: steps.label-step.outputs.labels_added != '[]'
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
// Configuration: Map labels to GitHub users to CC
|
||||
// You can add multiple users per label, and multiple label configurations
|
||||
const ccConfig = {
|
||||
rocm: {
|
||||
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
|
||||
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
|
||||
},
|
||||
// Add more label -> user mappings here
|
||||
// Example:
|
||||
// cuda: {
|
||||
// users: ['user1', 'user2'],
|
||||
// message: 'CC {users} for CUDA-related issue'
|
||||
// },
|
||||
// performance: {
|
||||
// users: ['perfexpert'],
|
||||
// message: 'CC {users} for performance issue'
|
||||
// },
|
||||
};
|
||||
|
||||
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
|
||||
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
|
||||
|
||||
// Get existing comments to check for already mentioned users
|
||||
const comments = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
});
|
||||
|
||||
const issueBody = context.payload.issue.body || '';
|
||||
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
|
||||
|
||||
// Process each label that was added
|
||||
for (const label of labelsAdded) {
|
||||
if (ccConfig[label]) {
|
||||
const config = ccConfig[label];
|
||||
const usersToMention = [];
|
||||
|
||||
// Check which users haven't been mentioned yet
|
||||
for (const user of config.users) {
|
||||
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
|
||||
if (!mentionPattern.test(allExistingText)) {
|
||||
usersToMention.push(user);
|
||||
} else {
|
||||
core.notice(`@${user} already mentioned for label "${label}", skipping`);
|
||||
}
|
||||
}
|
||||
|
||||
// Post comment if there are users to mention
|
||||
if (usersToMention.length > 0) {
|
||||
const mentions = usersToMention.map(u => `@${u}`).join(' ');
|
||||
const message = config.message.replace('{users}', mentions);
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: message
|
||||
});
|
||||
|
||||
core.notice(`CC comment added for label "${label}": ${mentions}`);
|
||||
} else {
|
||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
||||
}
|
||||
}
|
||||
}
|
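The keyword/substring distinction in the workflow above boils down to word-boundary versus plain matching. A hedged Python sketch of the same idea (the real logic lives in the JavaScript createSearchRegex above; the sample issue text is invented):

```python
import re


def create_search_regex(term: str, search_type: str) -> re.Pattern:
    """Mirror of the workflow's createSearchRegex: keywords match whole words,
    substrings match anywhere, both case-insensitively."""
    escaped = re.escape(term)
    if search_type == "keyword":
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if search_type == "substring":
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {search_type}")


text = "Seeing a hipify failure when building the ROCm backend"
print(bool(create_search_regex("rocm", "keyword").search(text)))     # True: whole-word match
print(bool(create_search_regex("hip_", "substring").search(text)))   # False: "hip_" not present
print(bool(create_search_regex("hipify", "substring").search(text))) # True: substring match
```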
.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
@@ -4,7 +4,6 @@ MD013: false
MD024:
  siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false
@@ -6,30 +6,19 @@ default_stages:
  - manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/google/yapf
  rev: v0.43.0
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
    exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.11.7
  rev: v0.14.0
  hooks:
  - id: ruff
  - id: ruff-check
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
  rev: v1.35.5
  rev: v1.38.1
  hooks:
  - id: typos
- repo: https://github.com/PyCQA/isort
  rev: 6.0.1
  hooks:
  - id: isort
    args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v20.1.3
  rev: v21.1.2
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,7 +35,7 @@ repos:
  hooks:
  - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
  rev: 0.6.17
  rev: 0.9.1
  hooks:
  - id: pip-compile
    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
@@ -67,11 +56,6 @@ repos:
    types_or: [python, pyi]
    require_serial: true
    additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
    entry: python tools/pre_commit/mypy.py 1 "3.9"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -87,6 +71,11 @@ repos:
    entry: python tools/pre_commit/mypy.py 1 "3.12"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.13
    entry: python tools/pre_commit/mypy.py 1 "3.13"
    <<: *mypy_common
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/shellcheck.sh
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/w8a8/int8/scaled_quant.cu"
  "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
    "csrc/quantization/fp8/per_token_group_quant.cu")
    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -1006,6 +1007,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
  include(cmake/external_projects/flashmla.cmake)
  include(cmake/external_projects/qutlass.cmake)

  # vllm-flash-attn should be last as it overwrites some CMake functions
  include(cmake/external_projects/vllm_flash_attn.cmake)
@@ -149,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
- Volcengine

Slack Sponsor: Anyscale
@@ -74,7 +74,7 @@ start_server() {
  local vllm_log=$4
  local profile_dir=$5

  pkill -if vllm
  pkill -if "vllm serve" || true

  # Define the common arguments as a bash array.
  # Each argument and its value are separate elements.
@@ -96,11 +96,11 @@ start_server() {
  # This correctly passes each element as a separate argument.
  if [[ -n "$profile_dir" ]]; then
    # Start server with profiling enabled
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
    VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
      vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
  else
    # Start server without profiling
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
    VLLM_SERVER_DEV_MODE=1 \
      vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
  fi
  local server_pid=$!
@@ -139,7 +139,7 @@ run_benchmark() {
  echo "vllm_log: $vllm_log"
  echo
  rm -f $vllm_log
  pkill -if vllm
  pkill -if "vllm serve" || true

  echo "starting server..."
  # Call start_server without a profile_dir to avoid profiling overhead
@@ -232,7 +232,7 @@ run_benchmark() {

  echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

  pkill -if vllm
  pkill -if "vllm serve" || true
  sleep 10
  echo "===================="
  return 0
@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
  echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
@ -8,7 +8,6 @@ import sys
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
import aiohttp
|
||||
import huggingface_hub.constants
|
||||
@ -28,13 +27,13 @@ class RequestFuncInput:
|
||||
prompt_len: int
|
||||
output_len: int
|
||||
model: str
|
||||
model_name: Optional[str] = None
|
||||
logprobs: Optional[int] = None
|
||||
extra_body: Optional[dict] = None
|
||||
multi_modal_content: Optional[dict | list[dict]] = None
|
||||
model_name: str | None = None
|
||||
logprobs: int | None = None
|
||||
extra_body: dict | None = None
|
||||
multi_modal_content: dict | list[dict] | None = None
|
||||
ignore_eos: bool = False
|
||||
language: Optional[str] = None
|
||||
request_id: Optional[str] = None
|
||||
language: str | None = None
|
||||
request_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -52,7 +51,7 @@ class RequestFuncOutput:
|
||||
|
||||
async def async_request_tgi(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
@ -133,7 +132,7 @@ async def async_request_tgi(
|
||||
|
||||
async def async_request_trt_llm(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
@ -204,7 +203,7 @@ async def async_request_trt_llm(
|
||||
|
||||
async def async_request_deepspeed_mii(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
|
||||
|
||||
async def async_request_openai_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
@ -367,7 +366,7 @@ async def async_request_openai_completions(
|
||||
|
||||
async def async_request_openai_chat_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("chat/completions", "profile")), (
|
||||
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
|
||||
|
||||
async def async_request_openai_audio(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||
import soundfile
|
||||
@ -610,7 +609,7 @@ def get_tokenizer(
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs,
|
||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
|
||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||
pretrained_model_name_or_path
|
||||
):
|
||||
|
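The backend_request_func.py hunks above replace `typing.Optional`/`Union` annotations with PEP 604 unions (`X | None`, `A | B`), which assumes Python 3.10 or newer. A minimal sketch of the spelling change, using hypothetical names rather than the benchmark's own classes:

from dataclasses import dataclass

@dataclass
class RequestInput:  # hypothetical stand-in for a dataclass like RequestFuncInput
    prompt: str
    model: str
    model_name: str | None = None   # previously: Optional[str] = None
    logprobs: int | None = None     # previously: Optional[int] = None
    extra_body: dict | None = None  # previously: Optional[dict] = None

def display_name(req: RequestInput) -> str | None:
    # `str | None` is equivalent to `Optional[str]`; only the spelling changes.
    return req.model_name or req.model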
@ -2,9 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
@ -5,9 +5,9 @@ import time
|
||||
from unittest import mock
|
||||
|
||||
import numpy as np
|
||||
from benchmark_utils import TimeCollector
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
DeviceConfig,
|
||||
@ -164,7 +164,7 @@ def invoke_main() -> None:
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batched", action="store_true", help="consider time to prepare batch"
|
||||
) # noqa: E501
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
|
@ -32,7 +32,6 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@ -80,7 +79,7 @@ def sample_requests_from_dataset(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
input_length_range: tuple[int, int],
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
) -> list[Request]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
@ -128,7 +127,7 @@ def sample_requests_from_random(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
input_length_range: tuple[int, int],
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
prefix_len: int,
|
||||
) -> list[Request]:
|
||||
requests = []
|
||||
|
@ -7,7 +7,6 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
@ -24,7 +23,7 @@ def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
) -> list[tuple[str, int, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
@ -31,20 +31,19 @@ import time
|
||||
import uuid
|
||||
import warnings
|
||||
from collections.abc import AsyncGenerator
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
@ -317,7 +316,7 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: list[str],
|
||||
selected_percentiles: list[float],
|
||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||
goodput_config_dict: dict[str, float] | None = None,
|
||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||
actual_output_lens: list[int] = []
|
||||
total_input = 0
|
||||
@ -437,9 +436,9 @@ async def benchmark(
|
||||
selected_percentile_metrics: list[str],
|
||||
selected_percentiles: list[str],
|
||||
ignore_eos: bool,
|
||||
max_concurrency: Optional[int],
|
||||
max_concurrency: int | None,
|
||||
structured_output_ratio: float,
|
||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||
goodput_config_dict: dict[str, float] | None = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -503,15 +502,9 @@ async def benchmark(
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
# This can be used once the minimum Python version is 3.10 or higher,
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
|
||||
|
||||
async def limited_request_func(request_func_input, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
|
||||
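The change above swaps the `None` sentinel for `contextlib.nullcontext()`, which on Python 3.10+ also works as an async context manager, so an optional concurrency cap needs only a single `async with`. A small self-contained sketch of that pattern (illustrative, not the benchmark's exact code):

import asyncio
from contextlib import nullcontext

async def run_all(jobs, max_concurrency: int | None = None):
    # Real semaphore when a limit is requested, otherwise a no-op context manager.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(job):
        async with semaphore:  # valid for both branches on Python >= 3.10
            return await job()

    return await asyncio.gather(*(limited(j) for j in jobs))

async def _demo():
    async def job(i):
        await asyncio.sleep(0.01)
        return i
    print(await run_all([lambda i=i: job(i) for i in range(8)], max_concurrency=2))

if __name__ == "__main__":
    asyncio.run(_demo())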
@ -910,13 +903,13 @@ def create_argument_parser():
|
||||
parser.add_argument(
|
||||
"--tokenizer",
|
||||
type=str,
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer-mode",
|
||||
type=str,
|
||||
default="auto",
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-prompts",
|
||||
|
@ -6,7 +6,7 @@ import math
|
||||
import os
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
@ -92,7 +92,7 @@ class TimeCollector:
|
||||
def __init__(self, scale: int) -> None:
|
||||
self.cnt: int = 0
|
||||
self._sum: int = 0
|
||||
self._max: Optional[int] = None
|
||||
self._max: int | None = None
|
||||
self.scale = scale
|
||||
self.start_time: int = time.monotonic_ns()
|
||||
|
||||
@ -104,13 +104,13 @@ class TimeCollector:
|
||||
else:
|
||||
self._max = max(self._max, v)
|
||||
|
||||
def avg(self) -> Union[float, str]:
|
||||
def avg(self) -> float | str:
|
||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||
|
||||
def max(self) -> Union[float, str]:
|
||||
def max(self) -> float | str:
|
||||
return self._max / self.scale if self._max else "N/A"
|
||||
|
||||
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||
def dump_avg_max(self) -> list[float | str]:
|
||||
return [self.avg(), self.max()]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
@ -118,8 +118,8 @@ class TimeCollector:
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[type[BaseException]],
|
||||
exc_value: Optional[BaseException],
|
||||
exc_traceback: Optional[TracebackType],
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_value: BaseException | None,
|
||||
exc_traceback: TracebackType | None,
|
||||
) -> None:
|
||||
self.collect(time.monotonic_ns() - self.start_time)
|
||||
|
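The benchmark_utils.py hunk above only modernizes the annotations (`Optional[int]` to `int | None`, `Union[float, str]` to `float | str`) on what is a small timing context manager. A rough standalone sketch of that kind of collector, written in the same annotation style (illustrative only, not vLLM's actual TimeCollector):

import time
from types import TracebackType

class Timer:
    """Accumulate elapsed monotonic time over repeated `with` blocks."""

    def __init__(self, scale: int = 1_000_000) -> None:  # default scale: milliseconds
        self.cnt: int = 0
        self._sum: int = 0
        self._max: int | None = None
        self.scale = scale
        self._start: int = 0

    def collect(self, v: int) -> None:
        self.cnt += 1
        self._sum += v
        self._max = v if self._max is None else max(self._max, v)

    def avg(self) -> float | str:
        return self._sum / self.cnt / self.scale if self.cnt else "N/A"

    def __enter__(self) -> None:
        self._start = time.monotonic_ns()

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.collect(time.monotonic_ns() - self._start)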
@ -6,8 +6,7 @@ import copy
|
||||
import itertools
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
|
@ -6,8 +6,7 @@ import copy
|
||||
import itertools
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable, Optional
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -53,7 +52,7 @@ def bench_int8(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
"""Benchmark INT8-based kernels."""
|
||||
assert dtype == torch.int8
|
||||
@ -108,7 +107,7 @@ def bench_fp8(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
"""Benchmark FP8-based kernels."""
|
||||
assert dtype == torch.float8_e4m3fn
|
||||
@ -183,7 +182,7 @@ def bench(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
if dtype == torch.int8:
|
||||
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
||||
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
|
||||
def run(
|
||||
dtype: torch.dtype,
|
||||
MKNs: Iterable[tuple[int, int, int]],
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
results = []
|
||||
for m, k, n in MKNs:
|
||||
|
@ -3,10 +3,9 @@
|
||||
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Callable, Iterable
|
||||
from dataclasses import dataclass
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
|
||||
def unfused_int8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
# Norm
|
||||
@ -68,7 +67,7 @@ def unfused_int8_impl(
|
||||
def unfused_fp8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
# Norm
|
||||
@ -85,7 +84,7 @@ def unfused_fp8_impl(
|
||||
def fused_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||
|
benchmarks/kernels/bench_mxfp4_qutlass.py (new file, 191 lines)
@@ -0,0 +1,191 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
|
||||
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"mxfp4": dict(no_a_quant=False, enabled=True),
|
||||
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
|
||||
return (
|
||||
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
|
||||
* group_size**-0.5
|
||||
)
|
||||
|
||||
|
||||
def _quant_weight_mxfp4(
|
||||
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
|
||||
):
|
||||
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
|
||||
b, forward_hadamard_matrix, method="abs_max"
|
||||
)
|
||||
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
|
||||
return weight_hf_e2m1, weight_hf_scale_block
|
||||
|
||||
|
||||
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
|
||||
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
|
||||
b, forward_hadamard_matrix, device
|
||||
)
|
||||
alpha = torch.tensor([1.0], device="cuda")
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
# Pre-quantize activation
|
||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
|
||||
a, forward_hadamard_matrix, method="abs_max"
|
||||
)
|
||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
|
||||
|
||||
def run():
|
||||
return matmul_mxf4_bf16_tn(
|
||||
input_hf_e2m1,
|
||||
weight_hf_e2m1,
|
||||
input_hf_scale_block,
|
||||
weight_hf_scale_block,
|
||||
alpha,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
# Quantize activation on-the-fly
|
||||
def run():
|
||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
|
||||
a, forward_hadamard_matrix, method="abs_max"
|
||||
)
|
||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
|
||||
return matmul_mxf4_bf16_tn(
|
||||
input_hf_e2m1,
|
||||
weight_hf_e2m1,
|
||||
input_hf_scale_block,
|
||||
weight_hf_scale_block,
|
||||
alpha,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[
|
||||
1,
|
||||
4,
|
||||
8,
|
||||
16,
|
||||
32,
|
||||
64,
|
||||
128,
|
||||
256,
|
||||
512,
|
||||
1024,
|
||||
2048,
|
||||
4096,
|
||||
8192,
|
||||
16384,
|
||||
24576,
|
||||
32768,
|
||||
],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
line_names=_enabled,
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs MXFP4 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K, had_size):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
|
||||
)
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_mxfp4_runner(
|
||||
cfg, a, b, forward_hadamard_matrix, dtype, device
|
||||
)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), rep=200, quantiles=quantiles
|
||||
)
|
||||
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
out = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
out.append(KN)
|
||||
return out
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
args = parser.parse_args()
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
for had_size in [32, 64, 128]:
|
||||
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_mxfp4_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
had_size=had_size,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
|
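Both new QUTLASS benchmark scripts report throughput by converting the measured kernel time with the standard dense-GEMM operation count of 2·M·N·K. A quick standalone check of that conversion (the shapes here are only an example):

def gemm_tflops(m: int, n: int, k: int, t_ms: float) -> float:
    """Convert a GEMM runtime in milliseconds to TFLOP/s using 2*M*N*K FLOPs."""
    return (2 * m * n * k) * 1e-12 / (t_ms * 1e-3)

# 8192 x 8192 x 8192 in 1.5 ms: 2 * 8192**3 ~= 1.1e12 FLOPs -> ~733 TFLOP/s
print(round(gemm_tflops(8192, 8192, 8192, 1.5), 1))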
benchmarks/kernels/bench_nvfp4_qutlass.py (new file, 207 lines)
@@ -0,0 +1,207 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
|
||||
from vllm._custom_ops import fusedQuantizeNv
|
||||
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
|
||||
return (
|
||||
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
|
||||
* group_size**-0.5
|
||||
)
|
||||
|
||||
|
||||
def _quant_weight_nvfp4(
|
||||
b: torch.Tensor,
|
||||
forward_hadamard_matrix: torch.Tensor,
|
||||
global_scale: torch.Tensor,
|
||||
device: str,
|
||||
M: int,
|
||||
N: int,
|
||||
K: int,
|
||||
):
|
||||
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
|
||||
b, forward_hadamard_matrix, global_scale
|
||||
)
|
||||
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
|
||||
-1, K // 16
|
||||
)
|
||||
return weight_hf_e2m1, weight_hf_scale_block
|
||||
|
||||
|
||||
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
|
||||
alpha = torch.tensor([1.0], device="cuda")
|
||||
global_scale = torch.tensor([1.0], device="cuda")
|
||||
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
|
||||
b, forward_hadamard_matrix, global_scale, device, M, N, K
|
||||
)
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
# Pre-quantize activation
|
||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
|
||||
a, forward_hadamard_matrix, global_scale
|
||||
)
|
||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
|
||||
-1, K // 16
|
||||
)
|
||||
|
||||
def run():
|
||||
return ops.cutlass_scaled_fp4_mm(
|
||||
input_hf_e2m1,
|
||||
weight_hf_e2m1,
|
||||
input_hf_scale_block,
|
||||
weight_hf_scale_block,
|
||||
alpha,
|
||||
torch.bfloat16,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
# Quantize activation on-the-fly
|
||||
def run():
|
||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
|
||||
a, forward_hadamard_matrix, global_scale
|
||||
)
|
||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
|
||||
-1, K // 16
|
||||
)
|
||||
return ops.cutlass_scaled_fp4_mm(
|
||||
input_hf_e2m1,
|
||||
weight_hf_e2m1,
|
||||
input_hf_scale_block,
|
||||
weight_hf_scale_block,
|
||||
alpha,
|
||||
torch.bfloat16,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[
|
||||
1,
|
||||
4,
|
||||
8,
|
||||
16,
|
||||
32,
|
||||
64,
|
||||
128,
|
||||
256,
|
||||
512,
|
||||
1024,
|
||||
2048,
|
||||
4096,
|
||||
8192,
|
||||
16384,
|
||||
24576,
|
||||
32768,
|
||||
],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
line_names=_enabled,
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs NVFP4 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K, had_size):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
|
||||
)
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_nvfp4_runner(
|
||||
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
|
||||
)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), rep=200, quantiles=quantiles
|
||||
)
|
||||
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
out = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
out.append(KN)
|
||||
return out
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
args = parser.parse_args()
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
for had_size in [16, 32, 64, 128]:
|
||||
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
had_size=had_size,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
|
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import itertools
|
||||
from typing import Callable
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import pandas as pd
|
||||
@ -10,7 +10,8 @@ import torch
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
def with_triton_mode(fn):
|
||||
|
@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
batch_size_range = [1, 16, 32, 64, 128]
|
||||
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
|
@ -22,8 +22,8 @@ Example:
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
|
||||
def benchmark_allreduce_single(
|
||||
self,
|
||||
sequence_length: int,
|
||||
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
|
||||
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
|
||||
should_use_fn: Callable[[torch.Tensor], bool],
|
||||
context,
|
||||
num_warmup: int,
|
||||
num_trials: int,
|
||||
) -> Optional[float]:
|
||||
) -> float | None:
|
||||
"""Benchmark method with CUDA graph optimization."""
|
||||
try:
|
||||
# Create test tensor (2D: sequence_length x hidden_size)
|
||||
|
@ -7,7 +7,8 @@ import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
|
@ -6,11 +6,12 @@ import copy
|
||||
import json
|
||||
import pickle
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from itertools import product
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -158,7 +159,7 @@ def ref_group_gemm(
|
||||
seq_lens_cpu: torch.Tensor,
|
||||
prompt_lora_mapping_cpu: torch.Tensor,
|
||||
scaling: float,
|
||||
add_inputs: Optional[bool],
|
||||
add_inputs: bool | None,
|
||||
):
|
||||
"""
|
||||
Torch group gemm reference implementation to test correctness of
|
||||
@ -316,8 +317,8 @@ class BenchmarkContext:
|
||||
lora_rank: int
|
||||
sort_by_lora_id: bool
|
||||
dtype: torch.dtype
|
||||
seq_length: Optional[int] = None
|
||||
num_slices: Optional[int] = None # num_slices for slice based ops
|
||||
seq_length: int | None = None
|
||||
num_slices: int | None = None # num_slices for slice based ops
|
||||
|
||||
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
||||
ctx = copy.copy(self)
|
||||
@ -561,7 +562,7 @@ class BenchmarkTensors:
|
||||
}
|
||||
|
||||
def bench_fn_kwargs(
|
||||
self, op_type: OpType, add_inputs: Optional[bool] = None
|
||||
self, op_type: OpType, add_inputs: bool | None = None
|
||||
) -> dict[str, Any]:
|
||||
if op_type.is_shrink_fn():
|
||||
assert add_inputs is None
|
||||
@ -575,7 +576,7 @@ class BenchmarkTensors:
|
||||
raise ValueError(f"Unrecognized optype {self}")
|
||||
|
||||
def test_correctness(
|
||||
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
|
||||
self, op_type: OpType, expand_fn_add_inputs: bool | None
|
||||
) -> bool:
|
||||
"""
|
||||
Test correctness of op_type implementation against a grouped gemm
|
||||
@ -611,8 +612,8 @@ def bench_optype(
|
||||
ctx: BenchmarkContext,
|
||||
arg_pool_size: int,
|
||||
op_type: OpType,
|
||||
cuda_graph_nops: Optional[int] = None,
|
||||
expand_fn_add_inputs: Optional[bool] = None,
|
||||
cuda_graph_nops: int | None = None,
|
||||
expand_fn_add_inputs: bool | None = None,
|
||||
test_correctness: bool = False,
|
||||
) -> TMeasurement:
|
||||
assert arg_pool_size >= 1
|
||||
@ -679,7 +680,7 @@ def bench_torch_mm(
|
||||
ctx: BenchmarkContext,
|
||||
arg_pool_size: int,
|
||||
op_type: OpType,
|
||||
cuda_graph_nops: Optional[int] = None,
|
||||
cuda_graph_nops: int | None = None,
|
||||
) -> TMeasurement:
|
||||
"""
|
||||
Benchmark basic torch.mm as a roofline.
|
||||
@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
|
||||
"""
|
||||
|
||||
|
||||
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
|
||||
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
@ -8,10 +8,9 @@ import math
|
||||
import os
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Callable, Iterable
|
||||
from dataclasses import dataclass
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
@ -63,23 +62,23 @@ class BenchmarkTensors:
|
||||
a: torch.Tensor
|
||||
|
||||
w_q: torch.Tensor
|
||||
group_size: Optional[int]
|
||||
group_size: int | None
|
||||
wtype: ScalarType
|
||||
w_g_s: torch.Tensor
|
||||
w_g_zp: Optional[torch.Tensor]
|
||||
w_ch_s: Optional[torch.Tensor]
|
||||
w_tok_s: Optional[torch.Tensor]
|
||||
w_g_zp: torch.Tensor | None
|
||||
w_ch_s: torch.Tensor | None
|
||||
w_tok_s: torch.Tensor | None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TypeConfig:
|
||||
act_type: torch.dtype
|
||||
weight_type: ScalarType
|
||||
output_type: Optional[torch.dtype]
|
||||
group_scale_type: Optional[torch.dtype]
|
||||
group_zero_type: Optional[torch.dtype]
|
||||
channel_scale_type: Optional[torch.dtype]
|
||||
token_scale_type: Optional[torch.dtype]
|
||||
output_type: torch.dtype | None
|
||||
group_scale_type: torch.dtype | None
|
||||
group_zero_type: torch.dtype | None
|
||||
channel_scale_type: torch.dtype | None
|
||||
token_scale_type: torch.dtype | None
|
||||
|
||||
|
||||
def rand_data(shape, dtype=torch.float16, scale=1):
|
||||
@ -93,8 +92,8 @@ def quantize_and_pack(
|
||||
atype: torch.dtype,
|
||||
w: torch.Tensor,
|
||||
wtype: ScalarType,
|
||||
stype: Optional[torch.dtype],
|
||||
group_size: Optional[int],
|
||||
stype: torch.dtype | None,
|
||||
group_size: int | None,
|
||||
zero_points: bool = False,
|
||||
):
|
||||
assert wtype.is_integer(), "TODO: support floating point weights"
|
||||
@ -113,7 +112,7 @@ def quantize_and_pack(
|
||||
|
||||
|
||||
def create_bench_tensors(
|
||||
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
|
||||
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
|
||||
) -> list[BenchmarkTensors]:
|
||||
m, n, k = shape
|
||||
|
||||
@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
|
||||
return res
|
||||
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
|
||||
|
||||
|
||||
def bench(
|
||||
|
@ -579,10 +579,12 @@ def main(args: argparse.Namespace):
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "JambaForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
@ -592,6 +594,7 @@ def main(args: argparse.Namespace):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
"Qwen2MoeForCausalLM",
|
||||
"Qwen3MoeForCausalLM",
|
||||
@ -600,10 +603,18 @@ def main(args: argparse.Namespace):
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
|
||||
text_config = config.get_text_config()
|
||||
E = text_config.num_experts
|
||||
topk = text_config.num_experts_per_tok
|
||||
intermediate_size = text_config.moe_intermediate_size
|
||||
hidden_size = text_config.hidden_size
|
||||
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
hidden_size = config.hidden_size
|
||||
else:
|
||||
# Support for llama4
|
||||
config = config.get_text_config()
|
||||
@ -611,6 +622,7 @@ def main(args: argparse.Namespace):
|
||||
E = config.num_local_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
enable_ep = bool(args.enable_expert_parallel)
|
||||
if enable_ep:
|
||||
ensure_divisibility(E, args.tp_size, "Number of experts")
|
||||
@ -619,8 +631,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
|
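The hunk above adds `Qwen3VLMoeForConditionalGeneration` support by reading the expert parameters from `config.get_text_config()`, since the multimodal wrapper config does not expose them at the top level. A hedged sketch of that lookup, assuming a recent `transformers` release and the attribute names shown in the diff; the checkpoint id in the comment is only a placeholder:

from transformers import AutoConfig

def moe_shape(model_id: str) -> tuple[int, int, int, int]:
    """Return (num_experts, top_k, intermediate_size, hidden_size) for a MoE checkpoint."""
    config = AutoConfig.from_pretrained(model_id)
    if config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
        config = config.get_text_config()  # expert sizes live on the text sub-config
    return (
        config.num_experts,
        config.num_experts_per_tok,
        config.moe_intermediate_size,
        config.hidden_size,
    )

# e.g. moe_shape("<qwen3-vl-moe-checkpoint>")  # placeholder id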
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
@ -3,16 +3,15 @@
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
FlexibleArgumentParser,
|
||||
create_kv_caches_with_random,
|
||||
)
|
||||
|
||||
@ -37,7 +36,7 @@ def main(
|
||||
seed: int,
|
||||
do_profile: bool,
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
kv_cache_dtype: str | None = None,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
|
@ -3,8 +3,8 @@
|
||||
|
||||
import argparse
|
||||
import math
|
||||
from collections.abc import Callable
|
||||
from contextlib import contextmanager
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
|
@ -1,155 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as vllm_ops
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
|
||||
def polynorm_naive(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: torch.Tensor,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
x = x.view(-1, x.shape[-1])
|
||||
|
||||
def norm(x, eps: float):
|
||||
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
|
||||
|
||||
x = x.float()
|
||||
return (
|
||||
(
|
||||
weight[0] * norm(x**3, eps)
|
||||
+ weight[1] * norm(x**2, eps)
|
||||
+ weight[2] * norm(x, eps)
|
||||
+ bias
|
||||
)
|
||||
.to(weight.dtype)
|
||||
.view(orig_shape)
|
||||
)
|
||||
|
||||
|
||||
def polynorm_vllm(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: torch.Tensor,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
x = x.view(-1, x.shape[-1])
|
||||
|
||||
out = torch.empty_like(x)
|
||||
vllm_ops.poly_norm(out, x, weight, bias, eps)
|
||||
output = out
|
||||
|
||||
output = output.view(orig_shape)
|
||||
return output
|
||||
|
||||
|
||||
def calculate_diff(batch_size, seq_len, hidden_dim):
|
||||
dtype = torch.bfloat16
|
||||
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
|
||||
weight = torch.ones(3, dtype=dtype, device="cuda")
|
||||
bias = torch.ones(1, dtype=dtype, device="cuda")
|
||||
|
||||
output_naive = polynorm_naive(x, weight, bias)
|
||||
output_vllm = polynorm_vllm(x, weight, bias)
|
||||
|
||||
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
|
||||
print("✅ All implementations match")
|
||||
else:
|
||||
print("❌ Implementations differ")
|
||||
|
||||
|
||||
batch_size_range = [2**i for i in range(0, 7, 2)]
|
||||
seq_length_range = [2**i for i in range(6, 11, 1)]
|
||||
dim_range = [2048, 4096]
|
||||
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
|
||||
|
||||
|
||||
def get_benchmark():
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["dim", "batch_size", "seq_len"],
|
||||
x_vals=[list(_) for _ in configs],
|
||||
line_arg="provider",
|
||||
line_vals=["naive", "vllm"],
|
||||
line_names=["Naive", "vLLM"],
|
||||
styles=[("blue", "-"), ("red", "-")],
|
||||
ylabel="us",
|
||||
plot_name="polynorm-perf",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(dim, batch_size, seq_len, provider):
|
||||
dtype = torch.bfloat16
|
||||
hidden_dim = dim * 4
|
||||
|
||||
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
|
||||
weight = torch.ones(3, dtype=dtype, device="cuda")
|
||||
bias = torch.ones(1, dtype=dtype, device="cuda")
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "naive":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: polynorm_naive(x, weight, bias),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: polynorm_vllm(x, weight, bias),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Batch size",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seq-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help="Sequence length",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hidden-dim",
|
||||
type=int,
|
||||
default=8192,
|
||||
help="Intermediate size of MLP",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-path",
|
||||
type=str,
|
||||
default="./configs/polnorm/",
|
||||
help="Path to save polnorm benchmark results",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run correctness test
|
||||
calculate_diff(
|
||||
batch_size=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
hidden_dim=args.hidden_dim,
|
||||
)
|
||||
|
||||
benchmark = get_benchmark()
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True, save_path=args.save_path)
|
@@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -11,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )
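The hunks above all perform the same migration: STR_DTYPE_TO_TORCH_DTYPE and the KV-cache helpers move from vllm.utils to vllm.utils.torch_utils, while FlexibleArgumentParser stays in vllm.utils. A hedged compatibility pattern for out-of-tree scripts that need to run against both layouts, assuming only the import location changed:

try:
    # Newer layout introduced by this change.
    from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
except ImportError:
    # Older releases still export it from vllm.utils.
    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from vllm.utils import FlexibleArgumentParser  # unchanged by this migration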
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm

@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:

@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)

@@ -65,7 +64,7 @@
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape

@@ -89,7 +88,7 @@
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch

@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
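The rmsnorm and rope hunks above are mechanical typing updates: Optional[X] becomes X | None and Union[A, B] becomes A | B. A small self-contained illustration of the equivalence (the function names here are illustrative only):

from typing import Optional, Union

import torch


# Old style, as removed above:
def old_sig(residual: Optional[torch.Tensor] = None) -> Union[int, float]:
    return 0


# New style (PEP 604), valid at runtime on Python 3.10+:
def new_sig(residual: torch.Tensor | None = None) -> int | float:
    return 0


assert old_sig() == new_sig()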
@@ -1,5 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+"""
+Comprehensive 3-way SiLU Benchmark Suite
+
+This benchmark compares three SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
 from collections.abc import Callable

 import matplotlib.pyplot as plt

@@ -7,7 +21,7 @@ import numpy as np
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm_cuda,
+    persistent_masked_m_silu_mul_quant,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
@@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
     num_parallel_tokens,
     group_size: int = 128,
     eps: float = 1e-10,
+    expert_offsets: torch.Tensor = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

@@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(


 # Parse generation strategies
-strategies = ["uniform", "max_t", "first_t"]
+strategies = ["random_imbalanced", "uniform", "max_t"]


 def benchmark(
@ -195,15 +210,27 @@ def benchmark(
|
||||
current_platform.seed_everything(42 + seed_offset)
|
||||
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
||||
|
||||
if gen_strategy == "uniform":
|
||||
r = torch.rand(size=(E,), device="cuda")
|
||||
if gen_strategy == "random_imbalanced":
|
||||
|
||||
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
|
||||
mean = total_tokens // n_e
|
||||
min_max = mean // ratio
|
||||
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
|
||||
e[0] = min_max
|
||||
r = torch.rand(size=(E - 1,))
|
||||
r /= r.sum()
|
||||
r *= total_tokens - min_max
|
||||
r = r.round().long()
|
||||
e[1:] = r.to(device=device)
|
||||
return e
|
||||
|
||||
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
|
||||
elif gen_strategy == "uniform":
|
||||
r = torch.rand(size=(E,))
|
||||
r /= r.sum()
|
||||
r *= total_tokens
|
||||
tokens_per_expert = r.int()
|
||||
tokens_per_expert = torch.minimum(
|
||||
tokens_per_expert,
|
||||
torch.ones((E,), device=r.device, dtype=torch.int) * T,
|
||||
)
|
||||
r = r.round().long()
|
||||
tokens_per_expert = r
|
||||
elif gen_strategy == "max_t":
|
||||
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
|
||||
tokens_per_expert.fill_(total_tokens / E)
|
||||
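The new random_imbalanced strategy above builds its per-expert token counts with a nested helper. A standalone CPU sketch of the same idea, assuming the intent is that expert 0 receives roughly mean/ratio tokens and the remaining experts split the rest randomly:

import torch


def generate_expert_loads_sketch(
    n_e: int, total_tokens: int, ratio: float
) -> torch.Tensor:
    mean = total_tokens // n_e
    first = int(mean // ratio)  # expert 0 gets an oversized share
    loads = torch.full((n_e,), mean, dtype=torch.int64)
    loads[0] = first
    rest = torch.rand(n_e - 1)
    rest = (rest / rest.sum() * (total_tokens - first)).round().long()
    loads[1:] = rest
    return loads


loads = generate_expert_loads_sketch(n_e=32, total_tokens=4096, ratio=0.7)
print(loads[0].item(), int(loads.sum()))  # oversized expert 0, total roughly 4096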
@ -281,40 +308,34 @@ def benchmark(
|
||||
|
||||
|
||||
def create_comparison_plot(
|
||||
ratio, cuda_times, baseline_times, config_labels, strategy_name, id
|
||||
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
|
||||
):
|
||||
"""Create a comparison plot for a specific generation strategy"""
|
||||
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
|
||||
fig, ax = plt.subplots(1, 1, figsize=(18, 6))
|
||||
|
||||
# Configure x-axis positions
|
||||
x = np.arange(len(config_labels))
|
||||
width = 0.35
|
||||
width = 0.25
|
||||
|
||||
# Execution Time plot (lower is better)
|
||||
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
|
||||
ax.bar(
|
||||
x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
|
||||
)
|
||||
ax.bar(
|
||||
x + width / 2,
|
||||
baseline_times,
|
||||
width,
|
||||
label="Baseline",
|
||||
alpha=0.8,
|
||||
color="orange",
|
||||
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
|
||||
)
|
||||
|
||||
# Add speedup labels over each bar pair
|
||||
# Add speedup labels over each bar trio
|
||||
for i in range(len(x)):
|
||||
speedup = ratio[i]
|
||||
max_height = max(cuda_times[i], baseline_times[i])
|
||||
triton_v2_speedup = ratios[i][1] # triton/v2
|
||||
max_height = max(silu_v2_times[i], triton_times[i])
|
||||
|
||||
# Triton/V2 speedup
|
||||
ax.text(
|
||||
x[i],
|
||||
x[i] + width / 2,
|
||||
max_height + max_height * 0.02,
|
||||
f"{speedup:.2f}x",
|
||||
f"{triton_v2_speedup:.2f}x",
|
||||
ha="center",
|
||||
va="bottom",
|
||||
fontweight="bold",
|
||||
fontsize=9,
|
||||
fontsize=8,
|
||||
)
|
||||
|
||||
ax.set_xlabel("Configuration")
|
||||
@ -332,56 +353,75 @@ def create_comparison_plot(
|
||||
|
||||
|
||||
def create_combined_plot(all_results):
|
||||
"""Create a combined plot with all strategies in one PNG"""
|
||||
num_strategies = len(all_results)
|
||||
fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
|
||||
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
|
||||
|
||||
if num_strategies == 1:
|
||||
axes = [axes]
|
||||
|
||||
for idx, (
|
||||
strategy_name,
|
||||
ratio,
|
||||
cuda_times,
|
||||
baseline_times,
|
||||
all_ratios,
|
||||
all_silu_v2_results,
|
||||
all_triton_results,
|
||||
config_labels,
|
||||
config_x_axis,
|
||||
) in enumerate(all_results):
|
||||
ax = axes[idx]
|
||||
|
||||
# Flatten the nested results to get bandwidth percentages for plotting
|
||||
silu_v2_bandwidths = []
|
||||
triton_bandwidths = []
|
||||
flat_ratios = []
|
||||
|
||||
for config_results in all_silu_v2_results:
|
||||
for result in config_results:
|
||||
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
|
||||
|
||||
for config_results in all_triton_results:
|
||||
for result in config_results:
|
||||
triton_bandwidths.append(result[3]) # bandwidth percentage
|
||||
|
||||
for config_ratios in all_ratios:
|
||||
for ratio in config_ratios:
|
||||
flat_ratios.append(ratio)
|
||||
|
||||
# Configure x-axis positions
|
||||
x = np.arange(len(config_labels))
|
||||
width = 0.35
|
||||
width = 0.25
|
||||
|
||||
# Execution Time plot (lower is better)
|
||||
# Bandwidth utilization plot (higher is better)
|
||||
ax.bar(
|
||||
x - width / 2,
|
||||
cuda_times,
|
||||
x,
|
||||
silu_v2_bandwidths,
|
||||
width,
|
||||
label="CUDA Kernel",
|
||||
label="SiLU V2 (CUDA)",
|
||||
alpha=0.8,
|
||||
color="blue",
|
||||
)
|
||||
ax.bar(
|
||||
x + width / 2,
|
||||
baseline_times,
|
||||
x + width,
|
||||
triton_bandwidths,
|
||||
width,
|
||||
label="Baseline",
|
||||
label="Triton Kernel",
|
||||
alpha=0.8,
|
||||
color="orange",
|
||||
color="green",
|
||||
)
|
||||
|
||||
# Add speedup labels over each bar pair
|
||||
# Add speedup labels over each bar trio
|
||||
for i in range(len(x)):
|
||||
speedup = ratio[i]
|
||||
max_height = max(cuda_times[i], baseline_times[i])
|
||||
triton_v2_speedup = flat_ratios[i] # triton/v2
|
||||
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
|
||||
|
||||
# Triton/V2 speedup
|
||||
ax.text(
|
||||
x[i],
|
||||
x[i] + width / 2,
|
||||
max_height + max_height * 0.02,
|
||||
f"{speedup:.2f}x",
|
||||
f"{triton_v2_speedup:.2f}x",
|
||||
ha="center",
|
||||
va="bottom",
|
||||
fontweight="bold",
|
||||
fontsize=9,
|
||||
fontsize=8,
|
||||
)
|
||||
|
||||
ax.set_xlabel("Configuration")
|
||||
@ -395,7 +435,7 @@ def create_combined_plot(all_results):
|
||||
ax.grid(True, alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
filename = "../../silu_bench/silu_benchmark_combined.png"
|
||||
filename = "silu_benchmark_combined_3way.png"
|
||||
plt.savefig(filename, dpi=300, bbox_inches="tight")
|
||||
plt.show()
|
||||
|
||||
@ -405,7 +445,9 @@ def create_combined_plot(all_results):
|
||||
outer_dim = 7168
|
||||
configs = [
|
||||
# DeepSeekV3 Configs
|
||||
# (1, 56, 7168),
|
||||
(8, 1024, 7168),
|
||||
# (32, 56, 7168),
|
||||
# DeepSeekV3 Configs
|
||||
(32, 1024, 7168),
|
||||
# DeepSeekV3 Configs
|
||||
@ -417,6 +459,7 @@ num_warmups = 20
|
||||
|
||||
strategy_descriptions = {
|
||||
"uniform": "Uniform Random",
|
||||
"random_imbalanced": "Imbalanced Random",
|
||||
"max_t": "Even Assignment",
|
||||
"first_t": "experts[0] = T, experts[1:] = 0",
|
||||
}
|
||||
@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
|
||||
print(f"Testing strategy: {strategy_descriptions[strategy]}")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# Collect benchmark data for both algorithms
|
||||
# Collect benchmark data for all three algorithms
|
||||
config_labels = []
|
||||
config_x_axis = []
|
||||
all_cuda_results = []
|
||||
all_baseline_results = []
|
||||
all_silu_v2_results = []
|
||||
all_triton_results = []
|
||||
all_ratios = []
|
||||
|
||||
for E, T, H in configs:
|
||||
total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
|
||||
total_tokens_config = []
|
||||
for i in [8, 16, 32, 64, 128, 256, 512]:
|
||||
if i <= T:
|
||||
total_tokens_config.append(i * E)
|
||||
config_x_axis.append(total_tokens_config)
|
||||
|
||||
cuda_results = []
|
||||
baseline_results = []
|
||||
silu_v2_results = []
|
||||
triton_results = []
|
||||
ratios = []
|
||||
|
||||
for total_tokens in total_tokens_config:
|
||||
config_label = f"E={E},T={T},H={H},TT={total_tokens}"
|
||||
config_labels.append(config_label)
|
||||
|
||||
# CUDA kernel results
|
||||
time_ms_cuda, gflops, gbps, perc = benchmark(
|
||||
silu_mul_fp8_quant_deep_gemm_cuda,
|
||||
# SiLU V2 (CUDA kernel) results
|
||||
time_ms_silu_v2, gflops, gbps, perc = benchmark(
|
||||
persistent_masked_m_silu_mul_quant,
|
||||
E,
|
||||
T,
|
||||
H,
|
||||
@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
|
||||
num_warmups=num_warmups,
|
||||
gen_strategy=strategy,
|
||||
)
|
||||
cuda_results.append((time_ms_cuda, gflops, gbps, perc))
|
||||
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
|
||||
|
||||
# Baseline results
|
||||
# Triton kernel results
|
||||
time_ms_triton, gflops, gbps, perc = benchmark(
|
||||
silu_mul_fp8_quant_deep_gemm_triton,
|
||||
E,
|
||||
@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
|
||||
num_warmups=num_warmups,
|
||||
gen_strategy=strategy,
|
||||
)
|
||||
baseline_results.append((time_ms_triton, gflops, gbps, perc))
|
||||
ratios.append(time_ms_triton / time_ms_cuda)
|
||||
triton_results.append((time_ms_triton, gflops, gbps, perc))
|
||||
|
||||
print(f"Completed: {config_label}")
|
||||
all_cuda_results.append(cuda_results)
|
||||
all_baseline_results.append(baseline_results)
|
||||
# Calculate speedup ratios (triton baseline / implementation)
|
||||
triton_v2_ratio = time_ms_triton / time_ms_silu_v2
|
||||
ratios.append(triton_v2_ratio)
|
||||
|
||||
print(
|
||||
f"Completed: {config_label}:"
|
||||
f" V2: {time_ms_silu_v2:.3f}ms,"
|
||||
f" Triton: {time_ms_triton:.3f}ms"
|
||||
)
|
||||
|
||||
all_silu_v2_results.append(silu_v2_results)
|
||||
all_triton_results.append(triton_results)
|
||||
all_ratios.append(ratios)
|
||||
|
||||
# Store results for combined plotting
|
||||
@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
|
||||
(
|
||||
strategy_descriptions[strategy],
|
||||
all_ratios,
|
||||
all_cuda_results,
|
||||
all_baseline_results,
|
||||
all_silu_v2_results,
|
||||
all_triton_results,
|
||||
config_labels,
|
||||
config_x_axis,
|
||||
)
|
||||
@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):
|
||||
|
||||
# Print summary table for this strategy
|
||||
print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
|
||||
print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
|
||||
print("-" * 60)
|
||||
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
|
||||
print("-" * 90)
|
||||
|
||||
for i, (E, T, H) in enumerate(configs):
|
||||
speedup = baseline_results[i][0] / cuda_results[i][0]
|
||||
# Get the first result for each config (simplifying for summary)
|
||||
v2_time = silu_v2_results[i][0]
|
||||
triton_time = triton_results[i][0]
|
||||
triton_v2_speedup = triton_time / v2_time
|
||||
config_label = f"E={E:3d},T={T:4d},H={H:4d}"
|
||||
print(
|
||||
f"{config_label:<20} {cuda_results[i][0]:8.5f} "
|
||||
f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
|
||||
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
|
||||
f"{triton_v2_speedup:8.2f}x"
|
||||
)
|
||||
|
||||
|
||||
@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
|
||||
num_strategies = len(all_results)
|
||||
num_configs = len(configs)
|
||||
|
||||
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
|
||||
fig, axs = plt.subplots(
|
||||
num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
|
||||
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
|
||||
)
|
||||
|
||||
# Add main title to the entire figure
|
||||
fig.suptitle(
|
||||
"Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
|
||||
fontsize=16,
|
||||
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
|
||||
fontsize=18,
|
||||
fontweight="bold",
|
||||
y=0.98,
|
||||
)
|
||||
@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
|
||||
(
|
||||
strategy_name,
|
||||
all_ratios,
|
||||
all_cuda_results,
|
||||
all_baseline_results,
|
||||
all_silu_v2_results,
|
||||
all_triton_results,
|
||||
config_labels,
|
||||
config_x_axis,
|
||||
) = result
|
||||
@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
|
||||
ratios = all_ratios[config_idx]
|
||||
total_tokens_values = config_x_axis[config_idx]
|
||||
|
||||
# Extract CUDA and Triton bandwidth percentages
|
||||
cuda_bandwidth_percentages = [
|
||||
result[3] for result in all_cuda_results[config_idx]
|
||||
# Extract speedup ratios
|
||||
triton_v2_ratios = [ratio for ratio in ratios]
|
||||
|
||||
# Extract bandwidth percentages for all implementations
|
||||
v2_bandwidth_percentages = [
|
||||
result[3] for result in all_silu_v2_results[config_idx]
|
||||
]
|
||||
triton_bandwidth_percentages = [
|
||||
result[3] for result in all_baseline_results[config_idx]
|
||||
result[3] for result in all_triton_results[config_idx]
|
||||
]
|
||||
|
||||
# Plot speedup ratios vs total tokens (left plot)
|
||||
ax_speedup.plot(
|
||||
total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
|
||||
total_tokens_values,
|
||||
triton_v2_ratios,
|
||||
"go-",
|
||||
linewidth=3,
|
||||
markersize=8,
|
||||
label="Triton/V2 Speedup",
|
||||
)
|
||||
ax_speedup.set_title(
|
||||
f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
|
||||
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
|
||||
fontsize=12,
|
||||
fontweight="bold",
|
||||
)
|
||||
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
|
||||
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
|
||||
ax_speedup.legend(prop={"weight": "bold"})
|
||||
ax_speedup.grid(True, alpha=0.3)
|
||||
|
||||
# Plot bandwidth utilization (right plot)
|
||||
ax_bandwidth.plot(
|
||||
total_tokens_values,
|
||||
cuda_bandwidth_percentages,
|
||||
"ro-",
|
||||
v2_bandwidth_percentages,
|
||||
"o-",
|
||||
linewidth=3,
|
||||
markersize=8,
|
||||
label="CUDA",
|
||||
label="SiLU V2",
|
||||
color="blue",
|
||||
)
|
||||
ax_bandwidth.plot(
|
||||
total_tokens_values,
|
||||
triton_bandwidth_percentages,
|
||||
"go-",
|
||||
"o-",
|
||||
linewidth=3,
|
||||
markersize=8,
|
||||
label="Triton",
|
||||
color="green",
|
||||
)
|
||||
ax_bandwidth.set_title(
|
||||
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
|
||||
@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
|
||||
for label in ax.get_xticklabels() + ax.get_yticklabels():
|
||||
label.set_fontweight("bold")
|
||||
|
||||
# Add value labels on speedup points
|
||||
for x, y in zip(total_tokens_values, ratios):
|
||||
# Add value labels on Triton/V2 speedup points
|
||||
for x, y in zip(total_tokens_values, triton_v2_ratios):
|
||||
ax_speedup.annotate(
|
||||
f"{y:.2f}x",
|
||||
(x, y),
|
||||
textcoords="offset points",
|
||||
xytext=(0, 12),
|
||||
ha="center",
|
||||
fontsize=10,
|
||||
fontweight="bold",
|
||||
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
|
||||
)
|
||||
|
||||
# Add value labels on CUDA bandwidth points
|
||||
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
|
||||
ax_bandwidth.annotate(
|
||||
f"{y:.1f}%",
|
||||
(x, y),
|
||||
textcoords="offset points",
|
||||
xytext=(0, 12),
|
||||
ha="center",
|
||||
fontsize=9,
|
||||
fontweight="bold",
|
||||
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
|
||||
)
|
||||
|
||||
# Add value labels on Triton bandwidth points
|
||||
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
|
||||
ax_bandwidth.annotate(
|
||||
f"{y:.1f}%",
|
||||
(x, y),
|
||||
textcoords="offset points",
|
||||
xytext=(0, -15),
|
||||
ha="center",
|
||||
fontsize=9,
|
||||
@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):
|
||||
|
||||
plt.tight_layout()
|
||||
plt.subplots_adjust(top=0.93) # Make room for main title
|
||||
filename = "silu_benchmark_total_tokens.png"
|
||||
filename = "silu_benchmark_total_tokens_3way.png"
|
||||
plt.savefig(filename, dpi=300, bbox_inches="tight")
|
||||
plt.show()
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
# Create combined plot with all strategies
|
||||
combined_plot_filename = create_total_tokens_plot(all_results)
|
||||
# Create comprehensive 3-way comparison plots
|
||||
combined_plot_filename = create_combined_plot(all_results)
|
||||
total_tokens_plot_filename = create_total_tokens_plot(all_results)
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
print("Benchmark Complete!")
|
||||
print(f"Generated combined plot: {combined_plot_filename}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"\n{'=' * 80}")
|
||||
print("3-Way Benchmark Suite Complete!")
|
||||
print(f"Generated combined comparison plot: {combined_plot_filename}")
|
||||
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
|
||||
print("Compared: SiLU V2 (CUDA), and Triton implementations")
|
||||
print(f"{'=' * 80}")
|
||||
|
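The suite above reports achieved GB/s and a bandwidth-utilization percentage on Hopper. A hedged helper showing how such numbers are typically derived; the 3.35 TB/s peak for H100 HBM3 is an assumption here, not a value taken from the script:

def bandwidth_utilization(
    bytes_moved: int, time_ms: float, peak_gb_s: float = 3350.0
) -> tuple[float, float]:
    # Achieved bandwidth in GB/s and the percentage of the assumed peak.
    gb_s = (bytes_moved / 1e9) / (time_ms / 1e3)
    return gb_s, 100.0 * gb_s / peak_gb_s


# Example: 2 GiB moved in 1.2 ms.
gbps, pct = bandwidth_utilization(2 * 1024**3, 1.2)
print(f"{gbps:.0f} GB/s, {pct:.1f}% of peak")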
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_decode(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_prefill(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
@ -14,7 +14,7 @@ import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul,
|
||||
_w8a8_triton_block_scaled_mm,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
@ -83,7 +83,7 @@ def w8a8_block_matmul(
|
||||
)
|
||||
|
||||
if A.dtype == torch.float8_e4m3fn:
|
||||
kernel = _w8a8_block_fp8_matmul
|
||||
kernel = _w8a8_triton_block_scaled_mm
|
||||
else:
|
||||
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# fmt: off
|
||||
# ruff: noqa: E501
|
||||
import time
|
||||
|
||||
@ -20,19 +19,21 @@ from vllm.utils.deep_gemm import (
|
||||
)
|
||||
|
||||
|
||||
def benchmark_shape(m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
warmup: int = 100,
|
||||
repeat: int = 10000,
|
||||
verbose: bool = False) -> dict:
|
||||
def benchmark_shape(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
warmup: int = 100,
|
||||
repeat: int = 10000,
|
||||
verbose: bool = False,
|
||||
) -> dict:
|
||||
"""Benchmark all implementations for a specific (m, n, k) shape."""
|
||||
if verbose:
|
||||
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
|
||||
|
||||
# Create test tensors
|
||||
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
|
||||
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
|
||||
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
|
||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
# Reference result in BF16
|
||||
torch.cuda.synchronize()
|
||||
@ -49,34 +50,39 @@ def benchmark_shape(m: int,
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
|
||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
A, block_size[1], column_major_scales=True)
|
||||
A, block_size[1], column_major_scales=True
|
||||
)
|
||||
|
||||
# === DeepGEMM Implementation ===
|
||||
def deepgemm_gemm():
|
||||
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
|
||||
(B_deepgemm, B_scale_deepgemm),
|
||||
C_deepgemm)
|
||||
fp8_gemm_nt(
|
||||
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
|
||||
)
|
||||
return C_deepgemm
|
||||
|
||||
# === vLLM Triton Implementation ===
|
||||
def vllm_triton_gemm():
|
||||
return w8a8_triton_block_scaled_mm(A_vllm,
|
||||
B_vllm,
|
||||
A_scale_vllm,
|
||||
B_scale_vllm,
|
||||
block_size,
|
||||
output_dtype=torch.bfloat16)
|
||||
return w8a8_triton_block_scaled_mm(
|
||||
A_vllm,
|
||||
B_vllm,
|
||||
A_scale_vllm,
|
||||
B_scale_vllm,
|
||||
block_size,
|
||||
output_dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# === vLLM CUTLASS Implementation ===
|
||||
def vllm_cutlass_gemm():
|
||||
return ops.cutlass_scaled_mm(A_vllm_cutlass,
|
||||
B_vllm.T,
|
||||
scale_a=A_scale_vllm_cutlass,
|
||||
scale_b=B_scale_vllm.T,
|
||||
out_dtype=torch.bfloat16)
|
||||
return ops.cutlass_scaled_mm(
|
||||
A_vllm_cutlass,
|
||||
B_vllm.T,
|
||||
scale_a=A_scale_vllm_cutlass,
|
||||
scale_b=B_scale_vllm.T,
|
||||
out_dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Run correctness check first
|
||||
if verbose:
|
||||
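benchmark_shape relies on a calc_diff helper (used in the hunk below) to score each implementation against the BF16 reference. A hedged sketch of a typical similarity-based difference metric; the actual helper imported from the deep_gemm utilities may be defined differently:

import torch


def calc_diff_sketch(x: torch.Tensor, y: torch.Tensor) -> float:
    # 0.0 means identical outputs, larger values mean more disagreement.
    x, y = x.double(), y.double()
    sim = 2.0 * (x * y).sum() / ((x * x).sum() + (y * y).sum())
    return float(1.0 - sim)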
@ -93,26 +99,23 @@ def benchmark_shape(m: int,
|
||||
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
||||
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
||||
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
||||
print("vLLM Triton vs DeepGEMM difference: "
|
||||
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
|
||||
print("vLLM CUTLASS vs DeepGEMM difference: "
|
||||
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
|
||||
print(
|
||||
"vLLM Triton vs DeepGEMM difference: "
|
||||
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
|
||||
)
|
||||
print(
|
||||
"vLLM CUTLASS vs DeepGEMM difference: "
|
||||
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
|
||||
)
|
||||
|
||||
# Benchmark implementations
|
||||
implementations = {
|
||||
"DeepGEMM": deepgemm_gemm,
|
||||
"vLLM Triton": vllm_triton_gemm,
|
||||
"vLLM CUTLASS": vllm_cutlass_gemm
|
||||
"vLLM CUTLASS": vllm_cutlass_gemm,
|
||||
}
|
||||
|
||||
benchmark_results = {
|
||||
"shape": {
|
||||
"m": m,
|
||||
"n": n,
|
||||
"k": k
|
||||
},
|
||||
"implementations": {}
|
||||
}
|
||||
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
|
||||
|
||||
for name, func in implementations.items():
|
||||
# Warmup
|
||||
@ -140,38 +143,36 @@ def benchmark_shape(m: int,
|
||||
"tflops": tflops,
|
||||
"gb_s": gb_s,
|
||||
"diff": {
|
||||
"DeepGEMM":
|
||||
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
|
||||
"Reference":
|
||||
deepgemm_diff if name == "DeepGEMM" else
|
||||
(vllm_triton_diff
|
||||
if name == "vLLM Triton" else vllm_cutlass_diff)
|
||||
}
|
||||
"DeepGEMM": 0.0
|
||||
if name == "DeepGEMM"
|
||||
else calc_diff(func(), C_deepgemm),
|
||||
"Reference": deepgemm_diff
|
||||
if name == "DeepGEMM"
|
||||
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
|
||||
},
|
||||
}
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
|
||||
)
|
||||
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
|
||||
|
||||
# Calculate speedups
|
||||
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
||||
for name, data in benchmark_results["implementations"].items():
|
||||
if name != "DeepGEMM":
|
||||
speedup = baseline / data["time_ms"]
|
||||
benchmark_results["implementations"][name][
|
||||
"speedup_vs_deepgemm"] = speedup
|
||||
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
|
||||
if verbose:
|
||||
print(f"DeepGEMM is {1/speedup:.2f}x "
|
||||
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
|
||||
print(
|
||||
f"DeepGEMM is {1 / speedup:.2f}x "
|
||||
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
|
||||
)
|
||||
|
||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
|
||||
"time_ms"]
|
||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||
"time_ms"]
|
||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
|
||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
||||
benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||
"speedup_vs_triton"] = cutlass_vs_triton
|
||||
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
|
||||
cutlass_vs_triton
|
||||
)
|
||||
if verbose:
|
||||
print(
|
||||
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
||||
@ -183,8 +184,7 @@ def benchmark_shape(m: int,
|
||||
|
||||
def format_table_row(values, widths):
|
||||
"""Format a row with specified column widths."""
|
||||
return "| " + " | ".join(f"{val:{w}}"
|
||||
for val, w in zip(values, widths)) + " |"
|
||||
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
|
||||
|
||||
|
||||
def print_table(headers, rows, title=None):
|
||||
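For reference, the reformatted one-liner above behaves like this (the function is repeated here so the snippet is self-contained, and the widths are illustrative):

def format_table_row(values, widths):
    """Format a row with specified column widths."""
    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"


widths = [6, 6, 6, 10]
print(format_table_row(["m", "n", "k", "Time (us)"], widths))
print(format_table_row([64, 4096, 7168, "123.4"], widths))
# | m      | n      | k      | Time (us)  |
# |     64 |   4096 |   7168 | 123.4      |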
@ -292,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
|
||||
for result in all_results:
|
||||
shape = result["shape"]
|
||||
impl_data = result["implementations"]["DeepGEMM"]
|
||||
deepgemm_rows.append([
|
||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
|
||||
])
|
||||
deepgemm_rows.append(
|
||||
[
|
||||
shape["m"],
|
||||
shape["n"],
|
||||
shape["k"],
|
||||
f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}",
|
||||
f"{impl_data['gb_s']:.1f}",
|
||||
]
|
||||
)
|
||||
|
||||
print_table(deepgemm_headers,
|
||||
deepgemm_rows,
|
||||
title="DeepGEMM Implementation:")
|
||||
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
|
||||
|
||||
# Print vLLM Triton table
|
||||
triton_headers = [
|
||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
|
||||
]
|
||||
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
|
||||
triton_rows = []
|
||||
for result in all_results:
|
||||
shape = result["shape"]
|
||||
impl_data = result["implementations"]["vLLM Triton"]
|
||||
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||
triton_rows.append([
|
||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||
format_speedup(speedup)
|
||||
])
|
||||
triton_rows.append(
|
||||
[
|
||||
shape["m"],
|
||||
shape["n"],
|
||||
shape["k"],
|
||||
f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}",
|
||||
f"{impl_data['gb_s']:.1f}",
|
||||
format_speedup(speedup),
|
||||
]
|
||||
)
|
||||
|
||||
print_table(triton_headers,
|
||||
triton_rows,
|
||||
title="vLLM Triton Implementation:")
|
||||
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
|
||||
|
||||
# Print vLLM CUTLASS table
|
||||
cutlass_headers = [
|
||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
|
||||
"vs Triton"
|
||||
"m",
|
||||
"n",
|
||||
"k",
|
||||
"Time (μs)",
|
||||
"TFLOPS",
|
||||
"GB/s",
|
||||
"vs DeepGEMM",
|
||||
"vs Triton",
|
||||
]
|
||||
cutlass_rows = []
|
||||
for result in all_results:
|
||||
@ -331,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
|
||||
impl_data = result["implementations"]["vLLM CUTLASS"]
|
||||
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
||||
cutlass_rows.append([
|
||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||
format_speedup(vs_deepgemm),
|
||||
format_speedup(vs_triton)
|
||||
])
|
||||
cutlass_rows.append(
|
||||
[
|
||||
shape["m"],
|
||||
shape["n"],
|
||||
shape["k"],
|
||||
f"{impl_data['time_us']:.1f}",
|
||||
f"{impl_data['tflops']:.1f}",
|
||||
f"{impl_data['gb_s']:.1f}",
|
||||
format_speedup(vs_deepgemm),
|
||||
format_speedup(vs_triton),
|
||||
]
|
||||
)
|
||||
|
||||
print_table(cutlass_headers,
|
||||
cutlass_rows,
|
||||
title="vLLM CUTLASS Implementation:")
|
||||
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
|
||||
|
||||
# Calculate and print averages
|
||||
print("\n===== AVERAGE PERFORMANCE =====")
|
||||
|
||||
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
||||
avg_metrics = {
|
||||
impl: {
|
||||
"tflops": 0,
|
||||
"gb_s": 0,
|
||||
"time_ms": 0
|
||||
}
|
||||
for impl in implementations
|
||||
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
|
||||
}
|
||||
|
||||
for result in all_results:
|
||||
@ -370,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
|
||||
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
||||
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
||||
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
||||
avg_rows.append([
|
||||
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
|
||||
])
|
||||
avg_rows.append(
|
||||
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
|
||||
)
|
||||
|
||||
print_table(avg_headers, avg_rows)
|
||||
|
||||
@ -380,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
|
||||
avg_speedups = {
|
||||
"DeepGEMM vs vLLM Triton": 0,
|
||||
"DeepGEMM vs vLLM CUTLASS": 0,
|
||||
"vLLM CUTLASS vs vLLM Triton": 0
|
||||
"vLLM CUTLASS vs vLLM Triton": 0,
|
||||
}
|
||||
|
||||
for result in all_results:
|
||||
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
||||
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
|
||||
"time_ms"]
|
||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||
|
||||
avg_speedups[
|
||||
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||
avg_speedups[
|
||||
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||
avg_speedups[
|
||||
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
|
||||
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
|
||||
vllm_triton_time / vllm_cutlass_time
|
||||
)
|
||||
|
||||
print("\n===== AVERAGE SPEEDUPS =====")
|
||||
speedup_headers = ["Comparison", "Speedup"]
|
||||
@ -412,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
|
||||
|
||||
for result in all_results:
|
||||
for impl in implementations:
|
||||
avg_diff[impl] += result["implementations"][impl]["diff"][
|
||||
"Reference"]
|
||||
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
|
||||
|
||||
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
||||
diff_rows = []
|
||||
|
@ -2,8 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Callable, Optional
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -55,7 +55,7 @@ class Bench:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cuda_graph_params: Optional[CudaGraphBenchParams],
|
||||
cuda_graph_params: CudaGraphBenchParams | None,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
description: str,
|
||||
|
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from statistics import mean
|
||||
from typing import Any, NamedTuple, Optional, Union
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import numpy as np # type: ignore
|
||||
import pandas as pd # type: ignore
|
||||
@ -35,8 +35,8 @@ class Distribution(ABC):
|
||||
class UniformDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
min_val: Union[int, float],
|
||||
max_val: Union[int, float],
|
||||
min_val: int | float,
|
||||
max_val: int | float,
|
||||
is_integer: bool = True,
|
||||
) -> None:
|
||||
self.min_val = min_val
|
||||
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
|
||||
|
||||
|
||||
class ConstantDistribution(Distribution):
|
||||
def __init__(self, value: Union[int, float]) -> None:
|
||||
def __init__(self, value: int | float) -> None:
|
||||
self.value = value
|
||||
self.max_val = value
|
||||
|
||||
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
|
||||
|
||||
|
||||
class ZipfDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
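A hedged sketch of how a sample method for the ZipfDistribution above is typically realized with NumPy; clipping to max_val is an assumption based on the constructor fields shown:

import numpy as np


def sample_zipf(alpha: float, size: int, max_val: int | None = None) -> np.ndarray:
    samples = np.random.zipf(alpha, size=size)
    if max_val is not None:
        samples = np.minimum(samples, max_val)
    return samples


print(sample_zipf(2.0, size=8, max_val=512))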
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
|
||||
|
||||
|
||||
class PoissonDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
|
||||
class LognormalDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
mean: Optional[float] = None,
|
||||
sigma: Optional[float] = None,
|
||||
average: Optional[int] = None,
|
||||
median_ratio: Optional[float] = None,
|
||||
max_val: Optional[int] = None,
|
||||
mean: float | None = None,
|
||||
sigma: float | None = None,
|
||||
average: int | None = None,
|
||||
median_ratio: float | None = None,
|
||||
max_val: int | None = None,
|
||||
) -> None:
|
||||
self.average = average
|
||||
self.median_ratio = median_ratio
|
||||
|
@ -13,7 +13,7 @@ from datetime import datetime
|
||||
from enum import Enum
|
||||
from http import HTTPStatus
|
||||
from statistics import mean
|
||||
from typing import NamedTuple, Optional, Union
|
||||
from typing import NamedTuple
|
||||
|
||||
import aiohttp # type: ignore
|
||||
import numpy as np # type: ignore
|
||||
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
|
||||
|
||||
class ClientArgs(NamedTuple):
|
||||
seed: int
|
||||
max_num_requests: Optional[int]
|
||||
max_num_requests: int | None
|
||||
skip_first_turn: bool
|
||||
max_turns: Optional[int]
|
||||
max_turns: int | None
|
||||
max_active_conversations: int
|
||||
verbose: bool
|
||||
print_content: bool
|
||||
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
|
||||
|
||||
class MetricStats:
|
||||
def __init__(self) -> None:
|
||||
self.min: Optional[float] = None
|
||||
self.max: Optional[float] = None
|
||||
self.avg: Optional[float] = None
|
||||
self.min: float | None = None
|
||||
self.max: float | None = None
|
||||
self.avg: float | None = None
|
||||
self.sum = 0.0
|
||||
self.count = 0
|
||||
|
||||
@ -143,7 +143,7 @@ class MovingAverage:
|
||||
self.index = 0
|
||||
self.sum = 0.0
|
||||
self.count = 0
|
||||
self.avg: Optional[float] = None
|
||||
self.avg: float | None = None
|
||||
|
||||
def update(self, new_value: float) -> None:
|
||||
if self.count < self.window_size:
|
||||
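The hunk above only changes the annotation on MovingAverage.avg. For context, a minimal fixed-window moving average consistent with the fields shown (index, sum, count, avg, window_size); the benchmark's actual class may differ in details:

class MovingAverage:
    def __init__(self, window_size: int) -> None:
        self.window_size = window_size
        self.values = [0.0] * window_size
        self.index = 0
        self.sum = 0.0
        self.count = 0
        self.avg: float | None = None

    def update(self, new_value: float) -> None:
        if self.count < self.window_size:
            # Window not full yet: just accumulate.
            self.sum += new_value
            self.count += 1
        else:
            # Window full: replace the oldest sample.
            self.sum += new_value - self.values[self.index]
        self.values[self.index] = new_value
        self.index = (self.index + 1) % self.window_size
        self.avg = self.sum / self.count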
@ -169,7 +169,7 @@ class MovingAverage:
|
||||
class DebugStats:
|
||||
def __init__(self, logger: logging.Logger, window_size: int) -> None:
|
||||
self.logger = logger
|
||||
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
|
||||
self.metrics: dict[str, MovingAverage | MetricStats] = {
|
||||
"moving_avg_ttft_ms": MovingAverage(window_size),
|
||||
"moving_avg_tpot_ms": MovingAverage(window_size),
|
||||
"ttft_ms": MetricStats(),
|
||||
@ -198,14 +198,6 @@ class DebugStats:
|
||||
self.logger.info("-" * 50)
|
||||
|
||||
|
||||
# Must support Python 3.8, we can't use str.removeprefix(prefix)
|
||||
# introduced in Python 3.9
|
||||
def remove_prefix(text: str, prefix: str) -> str:
|
||||
if text.startswith(prefix):
|
||||
return text[len(prefix) :]
|
||||
return text
|
||||
|
||||
|
||||
def nanosec_to_millisec(value: float) -> float:
|
||||
return value / 1000000.0
|
||||
|
||||
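The deleted remove_prefix helper existed only for Python 3.8 compatibility; on 3.9+ the built-in str.removeprefix covers it, which is what a later hunk switches to. A quick illustration:

chunk = 'data: {"id": 1}'
assert chunk.removeprefix("data: ") == '{"id": 1}'
assert "other".removeprefix("data: ") == "other"  # unchanged when prefix is absent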
@ -220,8 +212,8 @@ async def send_request(
|
||||
chat_url: str,
|
||||
model: str,
|
||||
stream: bool = True,
|
||||
min_tokens: Optional[int] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
min_tokens: int | None = None,
|
||||
max_tokens: int | None = None,
|
||||
) -> ServerResponse:
|
||||
payload = {
|
||||
"model": model,
|
||||
@ -250,9 +242,9 @@ async def send_request(
|
||||
timeout = aiohttp.ClientTimeout(total=timeout_sec)
|
||||
|
||||
valid_response = True
|
||||
ttft: Optional[float] = None
|
||||
ttft: float | None = None
|
||||
chunk_delay: list[int] = []
|
||||
latency: Optional[float] = None
|
||||
latency: float | None = None
|
||||
first_chunk = ""
|
||||
generated_text = ""
|
||||
|
||||
@ -269,7 +261,7 @@ async def send_request(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk == "[DONE]":
|
||||
# End of stream
|
||||
latency = time.perf_counter_ns() - start_time
|
||||
@ -364,7 +356,7 @@ async def send_turn(
|
||||
req_args: RequestArgs,
|
||||
verbose: bool,
|
||||
verify_output: bool,
|
||||
) -> Optional[RequestStats]:
|
||||
) -> RequestStats | None:
|
||||
assert messages_to_use > 0
|
||||
assert messages_to_use <= len(conversation_messages)
|
||||
|
||||
@ -644,7 +636,7 @@ async def client_main(
|
||||
|
||||
if args.verbose:
|
||||
curr_time_sec: float = time.perf_counter()
|
||||
time_since_last_turn: Union[str, float] = "N/A"
|
||||
time_since_last_turn: str | float = "N/A"
|
||||
if conv_id in time_of_last_turn:
|
||||
time_since_last_turn = round(
|
||||
curr_time_sec - time_of_last_turn[conv_id], 3
|
||||
@ -769,7 +761,7 @@ def get_client_config(
|
||||
"Number of conversations must be equal or larger than the number of clients"
|
||||
)
|
||||
|
||||
max_req_per_client: Optional[int] = None
|
||||
max_req_per_client: int | None = None
|
||||
if args.max_num_requests is not None:
|
||||
# Max number of requests per client
|
||||
req_per_client = args.max_num_requests // args.num_clients
|
||||
@ -936,13 +928,13 @@ async def main_mp(
|
||||
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
||||
)
|
||||
|
||||
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
|
||||
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
|
||||
if len(client_metrics) < (5 * bench_args.num_clients):
|
||||
# Do not estimate the RPS if the number of samples is very low
|
||||
# (threshold can be tuned if needed)
|
||||
rps = "N/A"
|
||||
|
||||
runtime_left_sec: Union[str, float] = round(
|
||||
runtime_left_sec: str | float = round(
|
||||
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
||||
)
|
||||
if percent < 0.05:
|
||||
@ -1032,7 +1024,7 @@ def process_statistics(
|
||||
warmup_percentages: list[float],
|
||||
test_params: dict,
|
||||
verbose: bool,
|
||||
gen_conv_args: Optional[GenConvArgs] = None,
|
||||
gen_conv_args: GenConvArgs | None = None,
|
||||
excel_output: bool = False,
|
||||
) -> None:
|
||||
if len(client_metrics) == 0:
|
||||
@ -1259,7 +1251,7 @@ async def main() -> None:
|
||||
default=None,
|
||||
help="The model name used in the API. "
|
||||
"If not specified, the model name will be the "
|
||||
"same as the ``--model`` argument. ",
|
||||
"same as the `--model` argument. ",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
@ -13,7 +13,7 @@ import argparse
|
||||
import json
|
||||
import random
|
||||
from statistics import mean
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd # type: ignore
|
||||
import tqdm # type: ignore
|
||||
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
|
||||
|
||||
|
||||
def content_is_valid(
|
||||
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
|
||||
content: str, min_content_len: int | None, max_content_len: int | None
|
||||
) -> bool:
|
||||
if min_content_len and len(content) < min_content_len:
|
||||
return False
|
||||
@ -37,7 +37,7 @@ def content_is_valid(
|
||||
|
||||
|
||||
def print_stats(
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
|
||||
) -> None:
|
||||
# Collect statistics
|
||||
stats = []
|
||||
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
|
||||
seed: int,
|
||||
input_file: str,
|
||||
output_file: str,
|
||||
max_items: Optional[int],
|
||||
min_content_len: Optional[int] = None,
|
||||
max_content_len: Optional[int] = None,
|
||||
min_turns: Optional[int] = None,
|
||||
max_turns: Optional[int] = None,
|
||||
model: Optional[str] = None,
|
||||
max_items: int | None,
|
||||
min_content_len: int | None = None,
|
||||
max_content_len: int | None = None,
|
||||
min_turns: int | None = None,
|
||||
max_turns: int | None = None,
|
||||
model: str | None = None,
|
||||
) -> None:
|
||||
if min_turns and max_turns:
|
||||
assert min_turns <= max_turns
|
||||
|
@ -1,49 +0,0 @@
|
||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||
# following differences:
|
||||
# - ruff line length is overridden to 88
|
||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
"vllm/version.py" = ["F401"]
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
"E",
|
||||
# Pyflakes
|
||||
"F",
|
||||
# pyupgrade
|
||||
"UP",
|
||||
# flake8-bugbear
|
||||
"B",
|
||||
# flake8-simplify
|
||||
"SIM",
|
||||
# isort
|
||||
"I",
|
||||
# flake8-logging-format
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
# f-string format
|
||||
"UP032",
|
||||
# Can remove once 3.10+ is the minimum Python version
|
||||
"UP007",
|
||||
]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
known-first-party = ["vllm"]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = true
|
@ -198,13 +198,24 @@ else()
|
||||
endif()
|
||||
|
||||
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.9
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
|
||||
|
||||
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
|
||||
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
|
||||
)
|
||||
else()
|
||||
message(STATUS "Downloading oneDNN from GitHub")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.9
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
endif()
|
||||
|
||||
if(USE_ACL)
|
||||
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
|
||||
@ -213,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
endif()
|
||||
set(ONEDNN_AARCH64_USE_ACL "ON")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
||||
add_compile_definitions(VLLM_USE_ACL)
|
||||
endif()
|
||||
|
||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||
@ -308,4 +320,4 @@ define_gpu_extension_target(
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
message(STATUS "Enabling C extension.")
|
||||
message(STATUS "Enabling C extension.")
|
cmake/external_projects/qutlass.cmake (new file, 97 lines)
@ -0,0 +1,97 @@
|
||||
include(FetchContent)
|
||||
|
||||
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
|
||||
|
||||
if(DEFINED ENV{QUTLASS_SRC_DIR})
|
||||
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(QUTLASS_SRC_DIR)
|
||||
FetchContent_Declare(
|
||||
qutlass
|
||||
SOURCE_DIR ${QUTLASS_SRC_DIR}
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
qutlass
|
||||
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
|
||||
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
endif()
|
||||
|
||||
FetchContent_Populate(qutlass)
|
||||
|
||||
if(NOT qutlass_SOURCE_DIR)
|
||||
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
|
||||
endif()
|
||||
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
|
||||
|
||||
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
|
||||
|
||||
if(QUTLASS_ARCHS MATCHES "10\\.0a")
|
||||
set(QUTLASS_TARGET_CC 100)
|
||||
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
|
||||
set(QUTLASS_TARGET_CC 120)
|
||||
else()
|
||||
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
|
||||
endif()
|
||||
|
||||
set(QUTLASS_SOURCES
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
|
||||
)
|
||||
|
||||
set(QUTLASS_INCLUDES
|
||||
${qutlass_SOURCE_DIR}
|
||||
${qutlass_SOURCE_DIR}/qutlass
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/include
|
||||
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
|
||||
)
|
||||
|
||||
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
|
||||
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
|
||||
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
|
||||
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
|
||||
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
|
||||
else()
|
||||
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
|
||||
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
|
||||
endif()
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${QUTLASS_SOURCES}"
|
||||
CUDA_ARCHS "${QUTLASS_ARCHS}"
|
||||
)
|
||||
|
||||
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
|
||||
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
|
||||
target_compile_definitions(_C PRIVATE
|
||||
QUTLASS_DISABLE_PYBIND=1
|
||||
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
|
||||
)
|
||||
|
||||
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
|
||||
)
|
||||
|
||||
else()
|
||||
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
|
||||
message(STATUS
|
||||
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
|
||||
else()
|
||||
message(STATUS
|
||||
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
|
||||
"CUDA_ARCHS='${CUDA_ARCHS}'.")
|
||||
endif()
|
||||
endif()
|
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
|
||||
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
@ -16,7 +16,7 @@ import shutil
|
||||
|
||||
from torch.utils.hipify.hipify_python import hipify
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Project directory where all the source + include files live.
|
||||
@ -34,15 +34,14 @@ if __name__ == '__main__':
|
||||
)
|
||||
|
||||
# Source files to convert.
|
||||
parser.add_argument("sources",
|
||||
help="Source files to hipify.",
|
||||
nargs="*",
|
||||
default=[])
|
||||
parser.add_argument(
|
||||
"sources", help="Source files to hipify.", nargs="*", default=[]
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Limit include scope to project_dir only
|
||||
includes = [os.path.join(args.project_dir, '*')]
|
||||
includes = [os.path.join(args.project_dir, "*")]
|
||||
|
||||
# Get absolute path for all source files.
|
||||
extra_files = [os.path.abspath(s) for s in args.sources]
|
||||
@ -51,25 +50,31 @@ if __name__ == '__main__':
|
||||
# The directory might already exist to hold object files so we ignore that.
|
||||
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
||||
|
||||
hipify_result = hipify(project_directory=args.project_dir,
|
||||
output_directory=args.output_dir,
|
||||
header_include_dirs=[],
|
||||
includes=includes,
|
||||
extra_files=extra_files,
|
||||
show_detailed=True,
|
||||
is_pytorch_extension=True,
|
||||
hipify_extra_files_only=True)
|
||||
hipify_result = hipify(
|
||||
project_directory=args.project_dir,
|
||||
output_directory=args.output_dir,
|
||||
header_include_dirs=[],
|
||||
includes=includes,
|
||||
extra_files=extra_files,
|
||||
show_detailed=True,
|
||||
is_pytorch_extension=True,
|
||||
hipify_extra_files_only=True,
|
||||
)
|
||||
|
||||
hipified_sources = []
|
||||
for source in args.sources:
|
||||
s_abs = os.path.abspath(source)
|
||||
hipified_s_abs = (hipify_result[s_abs].hipified_path if
|
||||
(s_abs in hipify_result
|
||||
and hipify_result[s_abs].hipified_path is not None)
|
||||
else s_abs)
|
||||
hipified_s_abs = (
|
||||
hipify_result[s_abs].hipified_path
|
||||
if (
|
||||
s_abs in hipify_result
|
||||
and hipify_result[s_abs].hipified_path is not None
|
||||
)
|
||||
else s_abs
|
||||
)
|
||||
hipified_sources.append(hipified_s_abs)
|
||||
|
||||
assert (len(hipified_sources) == len(args.sources))
|
||||
assert len(hipified_sources) == len(args.sources)
|
||||
|
||||
# Print hipified source files.
|
||||
print("\n".join(hipified_sources))
|
||||
|
codecov.yml (new file, 12 lines)
@ -0,0 +1,12 @@
|
||||
codecov:
|
||||
require_ci_to_pass: false
|
||||
|
||||
fixes:
|
||||
# Map source code paths to repository root paths
|
||||
# Wildcards match any Python version (python3.*)
|
||||
- "/vllm-workspace/src/vllm/::vllm/"
|
||||
- "/vllm-workspace/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"
|
@@ -28,10 +28,10 @@

#ifdef USE_ROCM
  #include <hip/hip_bf16.h>
  #include "../quantization/fp8/amd/quant_utils.cuh"
  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
  #include "../quantization/fp8/nvidia/quant_utils.cuh"
  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -125,32 +125,37 @@ public:
  }

  static void set_split_kv (KernelArguments& args) {
    // printf("set_split_kv start");
    if (args.split_kv >= 1) return;
    auto [H, K, D, B] = args.problem_shape;
    // std::cout << H << " " << K << " " << D << " " << B << "\n";
    int sm_count = args.hw_info.sm_count;
    // printf(" sm_count = %d\n", sm_count);
    int max_splits = ceil_div(K, 128);
    max_splits = min(16, max_splits);
    float seq_length_k = static_cast<float>(K) / 1024.0f;
    int max_splits = 1;

    // TODO: This avoids a hang when the batch size larger than 1 and
    // there is more than 1 kv_splits.
    // Discuss with NVIDIA how this can be fixed.
    if (B > 1) {
      max_splits = min(1, max_splits);
    if (B <= 4 && seq_length_k >= 16) {
      max_splits = 16;
    }

    // printf(" max_splits = %d\n", max_splits);
    else if (B <= 8 && seq_length_k >= 4) {
      max_splits = 8;
    }
    else if ((B <= 16 && seq_length_k >= 8) ||
             (B == 48 && seq_length_k >= 32)) {
      max_splits = 4;
    }
    else if ((B <= 32 && seq_length_k >= 16) ||
             (B == 96 && seq_length_k >= 16)) {
      max_splits = 2;
    }
    else {
      max_splits = 1;
    }

    // Wave-aware scheduling: ensure integer number of waves in K dimension
    int sms_per_batch = max(1, sm_count / B);
    // printf(" sms_per_batch = %d\n", sms_per_batch);
    int split_heur = min(max_splits, sms_per_batch);
    int waves = ceil_div(B * split_heur, sm_count);
    int k_waves = ceil_div(max_splits, split_heur);
    int split_wave_aware = ceil_div(max_splits, k_waves);
    args.split_kv = split_wave_aware;
    // printf(" args.split_kv = %d\n", args.split_kv);

  }

  /// Determines whether the GEMM can execute the given problem.
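The new heuristic above first bounds max_splits by batch size and KV length, then rounds it so the K dimension is covered by a whole number of waves over the available SMs. A minimal host-side sketch of that wave-aware rounding step, with ceil_div and the integer inputs as illustrative stand-ins for the kernel's actual arguments:

#include <algorithm>
#include <cstdio>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

// Round a tentative split count so the K dimension is processed in an
// integer number of waves across the SMs assigned to each batch entry.
int wave_aware_split_kv(int max_splits, int batch_size, int sm_count) {
  int sms_per_batch = std::max(1, sm_count / batch_size);
  int split_heur = std::min(max_splits, sms_per_batch);
  int k_waves = ceil_div(max_splits, split_heur);
  return ceil_div(max_splits, k_waves);  // becomes args.split_kv
}

int main() {
  // e.g. B = 4 with a long KV sequence caps max_splits at 16 above.
  std::printf("split_kv = %d\n", wave_aware_split_kv(16, 4, 132));
  return 0;
}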
@@ -64,3 +64,11 @@ void indexer_k_quant_and_cache(
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt);

// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,   // [batch_size, num_blocks]
    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
@@ -9,14 +9,14 @@
#include "quantization/vectorization_utils.cuh"

#ifdef USE_ROCM
  #include "quantization/fp8/amd/quant_utils.cuh"
  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
#else
  #include "quantization/fp8/nvidia/quant_utils.cuh"
  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif

#include <algorithm>
#include <cassert>
#include <cfloat>  // FLT_MIN
#include <cfloat>

#ifdef USE_ROCM
  #include <hip/hip_bf16.h>
@@ -479,6 +479,7 @@ __global__ void concat_and_cache_ds_mla_kernel(

  // Compute the scale for the tile
  float tile_scale = max_abs / 448.f;
  tile_scale = fmaxf(tile_scale, FLT_MIN);

  // The first lane of each half-warp writes the scale to kv_cache
  if ((lane_idx == 0) || (lane_idx == 16)) {
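The added fmaxf(tile_scale, FLT_MIN) keeps the per-tile FP8 scale strictly positive, so an all-zero tile can no longer produce a zero scale (and a division by zero when the tile is later rescaled). A standalone sketch of the clamp, assuming 448 is used as the maximum finite FP8 E4M3 magnitude as in the kernel:

#include <cfloat>
#include <cmath>
#include <cstdio>

// Per-tile FP8 scale from the tile's max |value|, clamped away from zero.
float tile_scale_from_max_abs(float max_abs) {
  float scale = max_abs / 448.f;  // 448 = largest finite e4m3 value
  return fmaxf(scale, FLT_MIN);   // never exactly zero
}

int main() {
  std::printf("%g\n", tile_scale_from_max_abs(0.f));    // FLT_MIN, not 0
  std::printf("%g\n", tile_scale_from_max_abs(224.f));  // 0.5
  return 0;
}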
@@ -571,6 +572,70 @@ __global__ void indexer_k_quant_and_cache_kernel(
  }
}

template <int BLOCK_Y_SIZE>
__global__ void cp_gather_indexer_k_quant_cache_kernel(
    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
                                        // cache_stride]
    char* __restrict__ dst_k,      // [num_tokens, head_dim]
    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size *
                                   // 4]
    const int* __restrict__ block_table,  // [batch_size, num_blocks]
    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
    const int batch_size,              // batch size
    const int64_t token_stride,        // stride for each token in dst_k
    const int64_t head_dim,            // dimension of each head
    const int64_t block_stride,        // stride for each block in kv_cache
    const int64_t cache_token_stride,  // stride for each token in kv_cache
    const int64_t cache_block_size,    // num_tokens for each block in kv_cache
    const int num_blocks,              // number of blocks
    const int num_tokens,              // number of tokens
    const int quant_block_size         // quantization block size
) {
  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
  // Find batch index within a block
  __shared__ int batch_idx[BLOCK_Y_SIZE];
  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
       iter++) {
    int tid = iter * blockDim.x + threadIdx.x;
    if (tid < batch_size) {
      const int seq_start = cu_seq_lens[tid];
      const int seq_end = cu_seq_lens[tid + 1];
      if (token_idx >= seq_start && token_idx < seq_end) {
        batch_idx[threadIdx.y] = tid;
      }
    }
  }

#ifndef USE_ROCM
  __syncwarp();
#endif

  if (head_idx >= head_dim || token_idx >= num_tokens) {
    return;
  }
  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
                                    inbatch_seq_idx / cache_block_size];
  const int64_t src_block_offset = block_idx * block_stride;
  const int64_t cache_inblock_offset =
      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;

  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
  ;
  if (threadIdx.x == 0) {
    const int64_t src_scale_offset =
        src_block_offset + cache_block_size * head_dim +
        cache_inblock_offset * 4 / quant_block_size;
    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
  }
}

}  // namespace vllm

// KV_T is the data type of key and value tensors.
@@ -1172,3 +1237,59 @@ void indexer_k_quant_and_cache(
  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                             CALL_INDEXER_K_QUANT_AND_CACHE);
}

// Macro to dispatch the kernel based on the data amount.
#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                   \
  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                 \
      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,                \
              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),               \
         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                                \
          reinterpret_cast<char*>(kv_cache.data_ptr()),                      \
          reinterpret_cast<char*>(dst_k.data_ptr()),                         \
          reinterpret_cast<char*>(dst_scale.data_ptr()),                     \
          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),  \
          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),    \
          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),         \
          num_tokens, quant_block_size);

void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,  // [batch_size, num_blocks]
    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
) {
  int batch_size = block_table.size(0);
  int num_tokens = dst_k.size(0);
  int head_dim = dst_k.size(1);
  int quant_block_size = head_dim * 4 / dst_scale.size(1);

  TORCH_CHECK(kv_cache.device() == dst_k.device(),
              "kv_cache and dst_k must be on the same device");
  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
              "kv_cache and dst_scale must be on the same device");
  TORCH_CHECK(kv_cache.device() == block_table.device(),
              "kv_cache and block_table must be on the same device");
  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
              "kv_cache and cu_seq_lens must be on the same device");
  TORCH_CHECK(head_dim % quant_block_size == 0,
              "head_dim must be divisible by quant_block_size");

  constexpr int vec_size = 16;
  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (num_tokens < 32) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
  } else if (num_tokens < 64) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
  } else if (num_tokens < 128) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
  } else if (num_tokens < 256) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
  } else if (num_tokens < 512) {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
  } else {
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
  }
}
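The wrapper above selects the kernel's compile-time BLOCK_Y_SIZE from num_tokens, so small gathers still spread across enough thread blocks while large ones use taller blocks. A minimal sketch of the same threshold ladder, with a plain function template standing in for the kernel launch (thresholds copied from the wrapper; everything else is illustrative):

#include <cstdio>

template <int BLOCK_Y_SIZE>
void launch(int num_tokens) {
  std::printf("num_tokens=%d -> BLOCK_Y_SIZE=%d\n", num_tokens, BLOCK_Y_SIZE);
}

void dispatch_by_tokens(int num_tokens) {
  if (num_tokens < 32) launch<1>(num_tokens);
  else if (num_tokens < 64) launch<2>(num_tokens);
  else if (num_tokens < 128) launch<4>(num_tokens);
  else if (num_tokens < 256) launch<8>(num_tokens);
  else if (num_tokens < 512) launch<16>(num_tokens);
  else launch<32>(num_tokens);
}

int main() {
  dispatch_by_tokens(100);   // -> 4
  dispatch_by_tokens(2048);  // -> 32
  return 0;
}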
@@ -5,12 +5,15 @@

namespace vllm {

// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
  const char* val = std::getenv(env_key.c_str());
  return (val && std::atoi(val) != 0) ? 1 : 0;
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
  static bool cached = []() {
    std::string env_key = "VLLM_BATCH_INVARIANT";
    const char* val = std::getenv(env_key.c_str());
    return (val && std::atoi(val) != 0) ? 1 : 0;
  }();
  return cached;
}

}  // namespace vllm
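The helper is renamed to vllm_is_batch_invariant, reads VLLM_BATCH_INVARIANT instead of the old variable name, and memoizes the lookup in a function-local static so getenv only runs once. A standalone sketch of that caching idiom:

#include <cstdlib>
#include <cstdio>

// The immediately-invoked lambda runs once to initialize the static,
// so the environment is only consulted on the first call.
inline bool is_batch_invariant() {
  static bool cached = []() {
    const char* val = std::getenv("VLLM_BATCH_INVARIANT");
    return val != nullptr && std::atoi(val) != 0;
  }();
  return cached;
}

int main() {
  std::printf("batch invariant: %d\n", is_batch_invariant());
  return 0;
}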
@ -137,9 +137,8 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
|
||||
}
|
||||
|
||||
void DNNLMatMulPrimitiveHandler::prepack_weight(
|
||||
void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
|
||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||
{b_k_stride_, b_n_stride_});
|
||||
void* original_b_ptr, dnnl::memory::desc original_b_md,
|
||||
dnnl::memory::desc b_target_mem_desc) {
|
||||
dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
|
||||
dnnl::memory packed_weight(b_target_mem_desc, default_engine());
|
||||
{
|
||||
@ -250,7 +249,9 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
||||
if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
|
||||
assert(!use_azp_);
|
||||
};
|
||||
prepack_weight(args.b_ptr,
|
||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||
{b_k_stride_, b_n_stride_});
|
||||
prepack_weight(args.b_ptr, original_b_md,
|
||||
create_primitive_desc(
|
||||
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||
.use_bias = false,
|
||||
@ -412,12 +413,25 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
||||
assert(ab_type_ == dnnl::memory::data_type::f32 ||
|
||||
ab_type_ == dnnl::memory::data_type::bf16 ||
|
||||
ab_type_ == dnnl::memory::data_type::f16);
|
||||
prepack_weight(args.b_ptr,
|
||||
|
||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||
{b_k_stride_, b_n_stride_});
|
||||
|
||||
prepack_weight(args.b_ptr, original_b_md,
|
||||
create_primitive_desc(
|
||||
MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
||||
.use_bias = false,
|
||||
.bias_type = dnnl::memory::data_type::undef},
|
||||
MSizeCacheKey{
|
||||
#ifdef VLLM_USE_ACL
|
||||
// Arm Compute Library (ACL) backend for oneDNN does
|
||||
// not support runtime
|
||||
// dimensions, so we set M to a default value
|
||||
.a_m_size = 128,
|
||||
.a_m_stride = b_k_size_,
|
||||
#else
|
||||
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
||||
#endif
|
||||
.use_bias = false,
|
||||
.bias_type = dnnl::memory::data_type::undef},
|
||||
true)
|
||||
.weights_desc());
|
||||
init_runtime_memory_cache(args);
|
||||
@ -443,13 +457,31 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
||||
c_storage->set_data_handle((void*)args.c_ptr);
|
||||
c_mem_desc->dims[0] = args.a_m_size;
|
||||
|
||||
#ifndef VLLM_USE_ACL
|
||||
// We do not support in ACL backend of oneDNN, we handle bias by:
|
||||
// 1. copying it into the result tensor
|
||||
// 2. attaching a fused-sum post-op to the matmul primitive
|
||||
if (args.use_bias) {
|
||||
auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
|
||||
bias_storage->set_data_handle((void*)args.bias_ptr);
|
||||
}
|
||||
|
||||
#endif
|
||||
dnnl::matmul matmul = get_matmul_cache(args);
|
||||
|
||||
// With ACL backend of oneDNN, the required memory format might change when the
|
||||
// source tensor dims change. This does not really happen in practice, so isn't
|
||||
// a performance hit, but we need to support it because the API allows for it.
|
||||
#ifdef VLLM_USE_ACL
|
||||
auto new_expected_wei_desc =
|
||||
dnnl::matmul::primitive_desc(
|
||||
const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
|
||||
.weights_desc();
|
||||
if (new_expected_wei_desc != b_target_mem_desc_) {
|
||||
prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
|
||||
b_target_mem_desc_, new_expected_wei_desc);
|
||||
}
|
||||
#endif
|
||||
|
||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
||||
scratchpad_storage->set_data_handle(
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||
@ -484,7 +516,13 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
|
||||
} else {
|
||||
a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
|
||||
{key.a_m_stride, 1});
|
||||
#ifdef VLLM_USE_ACL
|
||||
// ACL's backend of oneDNN always expects the weight format to be "any"
|
||||
b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
|
||||
dnnl::memory::format_tag::any);
|
||||
#else
|
||||
b_md = b_target_mem_desc_;
|
||||
#endif
|
||||
}
|
||||
dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
|
||||
dnnl::memory::format_tag::ab);
|
||||
@ -494,8 +532,18 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
|
||||
|
||||
if (key.use_bias) {
|
||||
dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
|
||||
// Since ACL's matmuls don't support passing a bias_md, we apply the bias
|
||||
// through a fused-sum post-op
|
||||
#ifdef VLLM_USE_ACL
|
||||
dnnl::post_ops post_ops;
|
||||
post_ops.append_sum();
|
||||
attr.set_post_ops(post_ops);
|
||||
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
|
||||
attr);
|
||||
#else
|
||||
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
|
||||
c_md, attr);
|
||||
#endif
|
||||
} else {
|
||||
return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
|
||||
attr);
|
||||
@ -511,13 +559,23 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
|
||||
default_engine(), nullptr);
|
||||
set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
|
||||
|
||||
// ACL matmuls don't support bias_md, so we don't need these
|
||||
#ifndef VLLM_USE_ACL
|
||||
memory_cache_[DNNL_ARG_BIAS] =
|
||||
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
||||
default_engine(), nullptr);
|
||||
set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
|
||||
|
||||
#endif
|
||||
memory_cache_[DNNL_ARG_SCRATCHPAD] =
|
||||
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
|
||||
default_engine(), nullptr);
|
||||
set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
|
||||
}
|
||||
|
||||
bool is_onednn_acl_supported() {
|
||||
#ifdef VLLM_USE_ACL
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
@@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
 protected:
  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);

  void prepack_weight(void* original_b_ptr,
  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
                      dnnl::memory::desc b_target_mem_desc);

  void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
|
@ -527,21 +527,42 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||
|
||||
// ACL matmuls expect contiguous source tensors
|
||||
#ifdef VLLM_USE_ACL
|
||||
torch::Tensor a_contig = a.contiguous();
|
||||
#endif
|
||||
|
||||
MatMulPrimitiveHandler::ExecArgs exec_args;
|
||||
|
||||
#ifdef VLLM_USE_ACL
|
||||
exec_args.a_m_size = a_contig.size(0);
|
||||
exec_args.a_m_stride = a_contig.stride(0);
|
||||
#else
|
||||
exec_args.a_m_size = a.size(0);
|
||||
exec_args.a_m_stride = a.stride(0);
|
||||
|
||||
#endif
|
||||
VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
|
||||
if (bias.has_value()) {
|
||||
exec_args.use_bias = true;
|
||||
exec_args.bias_type = get_dnnl_type<scalar_t>();
|
||||
#ifdef VLLM_USE_ACL
|
||||
// ACL matmuls in oneDNN do not support a bias.
|
||||
// We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
|
||||
c.copy_(bias.value());
|
||||
#else
|
||||
exec_args.bias_ptr = bias->data_ptr<scalar_t>();
|
||||
#endif
|
||||
} else {
|
||||
exec_args.use_bias = false;
|
||||
exec_args.bias_type = get_dnnl_type<void>();
|
||||
exec_args.bias_ptr = nullptr;
|
||||
}
|
||||
#ifdef VLLM_USE_ACL
|
||||
exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
|
||||
#else
|
||||
exec_args.a_ptr = a.data_ptr<scalar_t>();
|
||||
|
||||
#endif
|
||||
exec_args.c_ptr = c.data_ptr<scalar_t>();
|
||||
|
||||
ptr->execute(exec_args);
|
||||
|
@@ -27,6 +27,8 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
               const std::optional<torch::Tensor>& bias, int64_t handler);

bool is_onednn_acl_supported();

void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -181,6 +183,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "int handler) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

  // Check if oneDNN was built with ACL backend
  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);

  // Create oneDNN W8A8 handler
  ops.def(
      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
|
@@ -12,6 +12,7 @@ using CubMaxOp = cub::Max;
#endif  // CUB_VERSION
#else
#include <hipcub/hipcub.hpp>
using CubAddOp = cub::Sum;
using CubMaxOp = cub::Max;
namespace cub = hipcub;
using CubAddOp = hipcub::Sum;
using CubMaxOp = hipcub::Max;
#endif  // USE_ROCM
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from typing import Union
|
||||
|
||||
from cutlass_library import *
|
||||
|
||||
@ -22,31 +21,31 @@ class MixedInputKernelScheduleType(enum.Enum):
|
||||
TmaWarpSpecializedCooperative = enum_auto()
|
||||
|
||||
|
||||
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeNames, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "u4b8",
|
||||
VLLMDataType.u8b128: "u8b128",
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeTag, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
||||
VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
|
||||
**DataTypeSize, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: 4,
|
||||
VLLMDataType.u8b128: 8,
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
VLLMDataType.u4b8: "vllm::kU4B8",
|
||||
VLLMDataType.u8b128: "vllm::kU8B128",
|
||||
DataType.u4: "vllm::kU4",
|
||||
@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.bf16: "vllm::kBfloat16",
|
||||
}
|
||||
|
||||
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
DataType.u8: "at::ScalarType::Byte",
|
||||
DataType.s8: "at::ScalarType::Char",
|
||||
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
|
||||
@ -67,15 +66,11 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.f32: "at::ScalarType::Float",
|
||||
}
|
||||
|
||||
VLLMKernelScheduleTag: dict[Union[
|
||||
MixedInputKernelScheduleType, KernelScheduleType], str] = {
|
||||
**KernelScheduleTag, # type: ignore
|
||||
**{
|
||||
MixedInputKernelScheduleType.TmaWarpSpecialized:
|
||||
"cutlass::gemm::KernelTmaWarpSpecialized",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedPingpong",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedCooperative",
|
||||
}
|
||||
}
|
||||
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
|
||||
**KernelScheduleTag, # type: ignore
|
||||
**{
|
||||
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
|
||||
},
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
|
||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -136,211 +148,6 @@ fused_add_rms_norm_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
/* Function specialization in the case of FP16/BF16 tensors.
|
||||
Additional optimizations we can make in this case are
|
||||
packed and vectorized operations, which help with the
|
||||
memory latency bottleneck.
|
||||
|
||||
_f16VecPN struct extends _f16Vec to add operations specifically required for
|
||||
polynomial normalization (poly norm).
|
||||
The original _f16Vec does not include the sum-of-powers computation or
|
||||
in-place polynomial normalization logic. */
|
||||
template <typename scalar_t, int width>
|
||||
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
|
||||
using Base = _f16Vec<scalar_t, width>;
|
||||
using Converter = typename Base::Converter;
|
||||
using T1 = typename Base::T1;
|
||||
using T2 = typename Base::T2;
|
||||
using Base::data;
|
||||
|
||||
__device__ auto sum_pows() const {
|
||||
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
float x2 = z.x * z.x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y4 = y2 * y2;
|
||||
float y6 = y4 * y2;
|
||||
|
||||
s2 += x2 + y2;
|
||||
s4 += x4 + y4;
|
||||
s6 += x6 + y6;
|
||||
}
|
||||
return std::make_tuple(s2, s4, s6);
|
||||
}
|
||||
|
||||
__device__ void poly_norm_inplace(const float w2_inv_std,
|
||||
const float w1_inv_std2,
|
||||
const float w0_inv_std3, const float bias) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||
|
||||
float x2 = z.x * z.x;
|
||||
float x3 = x2 * z.x;
|
||||
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
|
||||
|
||||
float y2 = z.y * z.y;
|
||||
float y3 = y2 * z.y;
|
||||
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
|
||||
|
||||
auto out = Converter::convert(z);
|
||||
data[i] = out.x;
|
||||
data[i + 1] = out.y;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
// Sanity checks on our vector struct and type-punned pointer arithmetic
|
||||
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
|
||||
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||
|
||||
/* These and the argument pointers are all declared `restrict` as they are
|
||||
not aliased in practice. Argument pointers should not be dereferenced
|
||||
in this kernel as that would be undefined behavior */
|
||||
auto* __restrict__ input_v =
|
||||
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
|
||||
const int vec_hidden_size = hidden_size / width;
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
auto [x2, x4, x6] = temp.sum_pows();
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
|
||||
|
||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||
int id = blockIdx.x * vec_hidden_size + idx;
|
||||
_f16VecPN<scalar_t, width> temp = input_v[id];
|
||||
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
|
||||
out_v[id] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Generic poly_norm_kernel
|
||||
The width field is not used here but necessary for other specializations.
|
||||
*/
|
||||
template <typename scalar_t, int width>
|
||||
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||
const scalar_t* __restrict__ weight, // [3]
|
||||
const scalar_t* __restrict__ bias, // [1]
|
||||
const float epsilon, const int hidden_size) {
|
||||
float variance = 0.0f;
|
||||
float variance2 = 0.0f;
|
||||
float variance3 = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x4 = x2 * x2;
|
||||
float x6 = x4 * x2;
|
||||
|
||||
variance += x2;
|
||||
variance2 += x4;
|
||||
variance3 += x6;
|
||||
}
|
||||
|
||||
float3 thread_variances = make_float3(variance, variance2, variance3);
|
||||
|
||||
struct SumOp {
|
||||
__device__ float3 operator()(const float3& a, const float3& b) const {
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
}
|
||||
};
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float3, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
float3 block_variances =
|
||||
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
|
||||
|
||||
variance = block_variances.x;
|
||||
variance2 = block_variances.y;
|
||||
variance3 = block_variances.z;
|
||||
|
||||
__shared__ float s_w2_inv_std;
|
||||
__shared__ float s_w1_inv_std2;
|
||||
__shared__ float s_w0_inv_std3;
|
||||
__shared__ float s_bias;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
float w0 = (float)weight[0];
|
||||
float w1 = (float)weight[1];
|
||||
float w2 = (float)weight[2];
|
||||
s_bias = (float)bias[0];
|
||||
|
||||
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
|
||||
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
|
||||
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||
float x2 = x * x;
|
||||
float x3 = x2 * x;
|
||||
|
||||
out[blockIdx.x * hidden_size + idx] =
|
||||
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
|
||||
s_bias);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
@ -352,18 +159,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
int64_t input_stride = input.stride(-2);
|
||||
|
||||
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
|
||||
// Instead, we use a 2d view to get the second-innermost stride.
|
||||
// That way the dimensions (except the last one) can be arbitrarily permuted.
|
||||
torch::Tensor input_view = input.view({-1, hidden_size});
|
||||
|
||||
int num_tokens = input_view.numel() / hidden_size;
|
||||
int64_t input_stride = input_view.stride(-2);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(hidden_size, 1024));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
|
||||
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input_view.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
|
||||
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
|
||||
hidden_size);
|
||||
});
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||
@ -380,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& residual, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [hidden_size]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
int hidden_size = input.size(-1);
|
||||
@ -414,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
wt_ptr % req_alignment_bytes == 0;
|
||||
bool offsets_are_multiple_of_vector_width =
|
||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
@ -422,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
}
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_POLY_NORM(width) \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
|
||||
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
|
||||
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
|
||||
hidden_size); \
|
||||
});
|
||||
|
||||
void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [3]
|
||||
torch::Tensor& bias, // [1]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.data_ptr() != input.data_ptr());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
/* This kernel is memory-latency bound in many scenarios.
|
||||
When num_tokens is large, a smaller block size allows
|
||||
for increased block occupancy on CUs and better latency
|
||||
hiding on global mem ops. */
|
||||
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
|
||||
dim3 block(std::min(hidden_size, max_block_size));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
/*If the tensor types are FP16/BF16, try to use the optimized kernel
|
||||
with packed + vectorized ops.
|
||||
Max optimization is achieved with a width-8 vector of FP16/BF16s
|
||||
since we can load at most 128 bits at once in a global memory op.
|
||||
However, this requires each tensor's data to be aligned to 16
|
||||
bytes.
|
||||
*/
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_POLY_NORM(0);
|
||||
}
|
||||
}
|
||||
|
@ -6,10 +6,11 @@
|
||||
*/
|
||||
|
||||
#include "type_convert.cuh"
|
||||
#include "quantization/fp8/common.cuh"
|
||||
#include "quantization/w8a8/fp8/common.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
int hidden_size = input.size(-1);
|
||||
int input_stride = input.stride(-2);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||
bool ptrs_are_aligned =
|
||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
|
@ -17,25 +17,30 @@ FILE_HEAD = """
|
||||
namespace MARLIN_NAMESPACE_NAME {
|
||||
""".strip()
|
||||
|
||||
TEMPLATE = ("template __global__ void Marlin<"
|
||||
"{{scalar_t}}, "
|
||||
"{{w_type_id}}, "
|
||||
"{{s_type_id}}, "
|
||||
"{{threads}}, "
|
||||
"{{thread_m_blocks}}, "
|
||||
"{{thread_n_blocks}}, "
|
||||
"{{thread_k_blocks}}, "
|
||||
"{{'true' if m_block_size_8 else 'false'}}, "
|
||||
"{{stages}}, "
|
||||
"{{group_blocks}}, "
|
||||
"{{'true' if is_zp_float else 'false'}}>"
|
||||
"( MARLIN_KERNEL_PARAMS );")
|
||||
TEMPLATE = (
|
||||
"template __global__ void Marlin<"
|
||||
"{{scalar_t}}, "
|
||||
"{{w_type_id}}, "
|
||||
"{{s_type_id}}, "
|
||||
"{{threads}}, "
|
||||
"{{thread_m_blocks}}, "
|
||||
"{{thread_n_blocks}}, "
|
||||
"{{thread_k_blocks}}, "
|
||||
"{{'true' if m_block_size_8 else 'false'}}, "
|
||||
"{{stages}}, "
|
||||
"{{group_blocks}}, "
|
||||
"{{'true' if is_zp_float else 'false'}}>"
|
||||
"( MARLIN_KERNEL_PARAMS );"
|
||||
)
|
||||
|
||||
# int8 with zero point case (vllm::kU8) is also supported,
|
||||
# we don't add it to reduce wheel size.
|
||||
SCALAR_TYPES = [
|
||||
"vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
|
||||
"vllm::kFE2M1f"
|
||||
"vllm::kU4",
|
||||
"vllm::kU4B8",
|
||||
"vllm::kU8B128",
|
||||
"vllm::kFE4M3fn",
|
||||
"vllm::kFE2M1f",
|
||||
]
|
||||
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
|
||||
|
||||
@ -58,11 +63,12 @@ def generate_new_kernels():
|
||||
all_template_str_list = []
|
||||
|
||||
for group_blocks, m_blocks, thread_configs in itertools.product(
|
||||
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
|
||||
|
||||
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
|
||||
):
|
||||
# act order case only support gptq-int4 and gptq-int8
|
||||
if group_blocks == 0 and scalar_type not in [
|
||||
"vllm::kU4B8", "vllm::kU8B128"
|
||||
"vllm::kU4B8",
|
||||
"vllm::kU8B128",
|
||||
]:
|
||||
continue
|
||||
if thread_configs[2] == 256:
|
||||
|
@ -8,12 +8,77 @@
|
||||
|
||||
#include "../cuda_compat.h"
|
||||
#include "../dispatch_utils.h"
|
||||
#include "core/math.hpp"
|
||||
|
||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace batched_moe_align_block_size {
|
||||
|
||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
||||
static constexpr int32_t num_threads = 1024;
|
||||
static constexpr int32_t num_blocks = 1;
|
||||
__global__ void batched_moe_align_block_size_kernel(
|
||||
int32_t const num_batches, int32_t const max_tokens_per_batch,
|
||||
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
|
||||
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
|
||||
int32_t* __restrict__ num_tokens_post_pad) {
|
||||
// TODO(varun): This is a naive implementation. Could be optimized.
|
||||
|
||||
size_t const batch_id = threadIdx.x;
|
||||
size_t const stride = blockDim.x * gridDim.x;
|
||||
int32_t const num_blocks_per_batch =
|
||||
CEILDIV(max_tokens_per_batch, block_size);
|
||||
int32_t const sorted_ids_size =
|
||||
num_blocks_per_batch * num_batches * block_size;
|
||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||
int32_t const SENTINEL =
|
||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||
// Intialize sorted_ids
|
||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||
sorted_ids[i] = SENTINEL;
|
||||
}
|
||||
// Intialize expert_ids with -1
|
||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||
block_ids[i] = -1;
|
||||
}
|
||||
|
||||
int32_t b_num_tokens = 0;
|
||||
if (batch_id < num_batches) {
|
||||
b_num_tokens = batch_num_tokens[batch_id];
|
||||
}
|
||||
int32_t const ceil_b_num_tokens =
|
||||
CEILDIV(b_num_tokens, block_size) * block_size;
|
||||
|
||||
// Compute prefix sum over token counts per expert
|
||||
using BlockScan = cub::BlockScan<int32_t, 1024>;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
int cumsum_val;
|
||||
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
|
||||
__syncthreads();
|
||||
|
||||
bool const is_last_batch = batch_id == (num_batches - 1);
|
||||
if (is_last_batch) {
|
||||
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
|
||||
}
|
||||
|
||||
if (batch_id < num_batches) {
|
||||
int32_t const batch_offset = batch_id * max_tokens_per_batch;
|
||||
for (size_t i = 0; i < b_num_tokens; ++i) {
|
||||
sorted_ids[cumsum_val + i] = batch_offset + i;
|
||||
}
|
||||
|
||||
int32_t const block_start = cumsum_val / block_size;
|
||||
int32_t const num_blocks = ceil_b_num_tokens / block_size;
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
block_ids[block_start + i] = batch_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace batched_moe_align_block_size
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
});
|
||||
}
|
||||
|
||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
||||
int64_t block_size,
|
||||
torch::Tensor const& batch_num_tokens,
|
||||
torch::Tensor sorted_ids,
|
||||
torch::Tensor batch_ids,
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
|
||||
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
int32_t const B = batch_num_tokens.size(0);
|
||||
int32_t const num_blocks_per_batch =
|
||||
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
|
||||
int32_t const num_blocks = num_blocks_per_batch * B;
|
||||
int64_t const sorted_ids_size = num_blocks * block_size;
|
||||
|
||||
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
|
||||
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
|
||||
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
|
||||
TORCH_CHECK(B <= batched_kernel::num_threads);
|
||||
|
||||
batched_kernel::batched_moe_align_block_size_kernel<<<
|
||||
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
|
||||
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
|
||||
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>());
|
||||
}
|
||||
|
||||
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||
torch::Tensor& output) // [num_tokens, hidden_size]
|
||||
{
|
||||
|
@@ -4,7 +4,7 @@

void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
                  torch::Tensor& gating_output);
                  torch::Tensor& gating_output, bool renormalize);

void moe_sum(torch::Tensor& input, torch::Tensor& output);

@@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
                                  torch::Tensor const& expert_num_tokens,
                                  torch::Tensor sorted_ids,
                                  torch::Tensor expert_ids,
                                  torch::Tensor num_tokens_post_pad);

#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
||||
|
@ -16,12 +16,22 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <type_traits>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#else
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
typedef __hip_bfloat162 __nv_bfloat162;
|
||||
#endif
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -37,16 +47,27 @@ template <
|
||||
/// Alignment requirement in bytes
|
||||
int Alignment = sizeof(T) * N
|
||||
>
|
||||
class alignas(Alignment) AlignedArray {
|
||||
float data[N];
|
||||
struct alignas(Alignment) AlignedArray {
|
||||
T data[N];
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ float toFloat(T value) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
return value;
|
||||
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
|
||||
return __bfloat162float(value);
|
||||
} else if constexpr (std::is_same_v<T, __half>) {
|
||||
return __half2float(value);
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing the output
|
||||
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||
template <int TPB>
|
||||
template <int TPB, typename InputType>
|
||||
__launch_bounds__(TPB) __global__
|
||||
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
|
||||
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
|
||||
{
|
||||
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
@ -67,7 +88,8 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData = max(val, threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
|
||||
@ -82,7 +104,8 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData += expf(val - float_max);
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
|
||||
@ -96,8 +119,9 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||
output[idx] = val;
|
||||
const float val = toFloat(input[idx]);
|
||||
const float softmax_val = expf(val - float_max) * normalizing_factor;
|
||||
output[idx] = softmax_val;
|
||||
}
|
||||
}
|
||||
|
||||
@ -111,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const int num_experts,
|
||||
const int k,
|
||||
const int start_expert,
|
||||
const int end_expert)
|
||||
const int end_expert,
|
||||
const bool renormalize)
|
||||
{
|
||||
|
||||
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||
@ -126,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
|
||||
const bool row_is_active = finished ? !finished[block_row] : true;
|
||||
const int thread_read_offset = blockIdx.x * num_experts;
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
thread_kvp.key = 0;
|
||||
@ -164,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||
assert(indices[idx] >= 0);
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
if (renormalize) {
|
||||
selected_sum += result_kvp.value;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Renormalize the k weights for this row to sum to 1, if requested.
|
||||
if (renormalize) {
|
||||
if (threadIdx.x == 0) {
|
||||
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = output[idx] / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== TopK softmax things ===============================
|
||||
@ -185,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
|
||||
{
|
||||
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
|
||||
std::is_same_v<InputType, __half>,
|
||||
"InputType must be float, __nv_bfloat16, or __half");
|
||||
|
||||
// We begin by enforcing compile time assertions and setting up compile time constants.
|
||||
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
|
||||
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||
|
||||
// Number of bytes each thread pulls in per load
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
|
||||
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
|
||||
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
|
||||
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
|
||||
|
||||
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
|
||||
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
|
||||
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
|
||||
}
|
||||
|
||||
// Restrictions based on previous section.
|
||||
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
||||
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
||||
@@ -237,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__

    // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
    // row it will read.
    const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
    const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;

    // Now, we compute the group each thread belong to in order to determine the first column to start loads.
    const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
    const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
    const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

    // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
    // this can support all powers of 2 up to 16.
    // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
    // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
    using AccessType = AlignedArray<float, ELTS_PER_LDG>;
    const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

    // Finally, we pull in the data from global mem
    float row_chunk[VPT];
    AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
    const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);

    // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
    if constexpr (std::is_same_v<InputType, float>) {
        using VecType = AlignedArray<float, ELTS_PER_LDG>;
        VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
        const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
    #pragma unroll
    for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
    {
        row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
        for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
            row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
        }
    } else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
        if constexpr (ELTS_PER_LDG >= 2) {
            using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
    #pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
    #pragma unroll
                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
                    row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
                        *reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
                    );
                }
            }
        } else { // ELTS_PER_LDG == 1
    #pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
                row_chunk[ii] = __bfloat162float(*scalar_ptr);
            }
        }
    } else if constexpr (std::is_same_v<InputType, __half>) {
        if constexpr (ELTS_PER_LDG >= 2) {
            using VecType = AlignedArray<__half, ELTS_PER_LDG>;
            float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
            const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
    #pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
                int base_idx_f2 = ii * ELTS_PER_LDG / 2;
    #pragma unroll
                for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
                    row_chunk_f2[base_idx_f2 + jj] = __half22float2(
                        *reinterpret_cast<const __half2*>(vec.data + jj * 2)
                    );
                }
            }
        } else { // ELTS_PER_LDG == 1
    #pragma unroll
            for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
                const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
                row_chunk[ii] = __half2float(*scalar_ptr);
            }
        }
    }

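    // At this point row_chunk holds the thread's slice of the gating row in fp32 for every InputType
    // branch above, so the max-reduce, softmax, and top-k selection below stay dtype-agnostic.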
    // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@@ -311,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
    int start_col = first_elt_read_by_thread;
    static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

    float selected_sum = 0.f;
    for (int k_idx = 0; k_idx < k; ++k_idx)
    {
        // First, each thread does the local argmax
@@ -364,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
            output[idx] = max_val;
            indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
            source_rows[idx] = k_idx * num_rows + thread_row;
            if (renormalize) {
                selected_sum += max_val;
            }
        }

        // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@@ -381,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
            }
        }
    }

    // Renormalize the k weights for this row to sum to 1, if requested.
    if (renormalize) {
        if (thread_group_idx == 0)
        {
            const float denom = selected_sum > 0.f ? selected_sum : 1.f;
            for (int k_idx = 0; k_idx < k; ++k_idx)
            {
                const int idx = k * thread_row + k_idx;
                output[idx] = output[idx] / denom;
            }
        }
    }
}

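The vectorized loads in the kernel above lean on an AlignedArray helper (defined elsewhere in this file, per the NOTE in the diff) so that a single LDG moves BYTES_PER_LDG bytes at once and 16-bit inputs can be widened to float in pairs. The snippet below is only a hedged sketch of that pattern; the real AlignedArray definition may differ, and load_bf16_chunk is a hypothetical helper written for illustration, not code from this diff.

#include <cuda_bf16.h>

// Minimal aligned-array sketch: N contiguous elements with the alignment of one vector load.
template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedArray {
    T data[N];
};

// Illustrative device-side use (assumes src is 16-byte aligned): one 16-byte load pulls
// 8 bf16 logits, which are widened to float two at a time via __bfloat1622float2.
__device__ inline void load_bf16_chunk(const __nv_bfloat16* src, float* dst /* 8 floats */) {
    using Vec = AlignedArray<__nv_bfloat16, 8>;
    const Vec v = *reinterpret_cast<const Vec*>(src);
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        const float2 f2 = __bfloat1622float2(*reinterpret_cast<const __nv_bfloat162*>(v.data + 2 * j));
        dst[2 * j] = f2.x;
        dst[2 * j + 1] = f2.y;
    }
}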
namespace detail
{
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstants
{
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
    static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@@ -398,21 +508,21 @@ struct TopkConstants
};
} // namespace detail

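To make the compile-time partitioning concrete, here is a hedged, standalone recomputation of the quantities TopkConstants derives, using illustrative parameters (256 experts, bf16 gating, 32-wide warps, 16-byte loads). The numbers are examples chosen for this sketch, not values taken from the diff.

// Mirrors the formulas shown above for an illustrative case:
// EXPERTS = 256, sizeof(InputType) = 2 (bf16), WARP_SIZE_PARAM = 32, BYTES_PER_LDG = 16.
constexpr int kExperts = 256;
constexpr int kWarpSize = 32;
constexpr int kBytesPerLdg = 16;
constexpr int kEltSize = 2;  // sizeof(__nv_bfloat16)
constexpr int kEltsPerLdg = kBytesPerLdg / kEltSize;                       // 8 experts per load
constexpr int kVecsPerThread = (kExperts / (kEltsPerLdg * kWarpSize) > 1)
                                   ? kExperts / (kEltsPerLdg * kWarpSize)
                                   : 1;                                    // MAX(1, 256 / 256) = 1
constexpr int kVpt = kVecsPerThread * kEltsPerLdg;                         // VPT = 8
static_assert(kEltsPerLdg == 8 && kVpt == 8, "derived values for the illustrative case");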
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
    cudaStream_t stream)
{
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
    const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
    const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
}

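The launch shape follows from two ceil-divisions: rows are packed onto warps, and warps onto thread blocks. A small standalone sketch with assumed example values (1000 rows, ROWS_PER_WARP = 4, WARPS_PER_TB = 4), kept separate from the diff:

// Illustrative launch-shape arithmetic mirroring the helper above (example numbers only).
constexpr int kRowsPerWarp = 4;
constexpr int kWarpsPerTb = 4;
constexpr int kNumRows = 1000;
constexpr int kNumWarps = (kNumRows + kRowsPerWarp - 1) / kRowsPerWarp;   // 250 warps
constexpr int kNumBlocks = (kNumWarps + kWarpsPerTb - 1) / kWarpsPerTb;   // 63 blocks
static_assert(kNumWarps == 250 && kNumBlocks == 63, "ceil-division launch sizing");
// Each block is then launched with dim3(WARP_SIZE_PARAM, WARPS_PER_TB) threads.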
#ifndef USE_ROCM
@@ -420,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
    static_assert(WARP_SIZE == 32, \
                  "Unsupported warp size. Only 32 is supported for CUDA"); \
    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
        gating_output, nullptr, topk_weights, topk_indices, \
        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
        num_tokens, topk, 0, num_experts, renormalize, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
    if (WARP_SIZE == 64) { \
        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
            gating_output, nullptr, topk_weights, topk_indices, \
            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
            num_tokens, topk, 0, num_experts, renormalize, stream); \
    } else if (WARP_SIZE == 32) { \
        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
            gating_output, nullptr, topk_weights, topk_indices, \
            token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
            num_tokens, topk, 0, num_experts, renormalize, stream); \
    } else { \
        assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
    }
#endif

template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    const InputType* gating_output,
    float* topk_weights,
    IndType* topk_indices,
    int* token_expert_indices,
@@ -447,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
    const int num_tokens,
    const int num_experts,
    const int topk,
    const bool renormalize,
    cudaStream_t stream) {
  static constexpr int WARPS_PER_TB = 4;
  static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
  static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
  // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
  // elements can be loaded by a warp
  static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
      (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
  switch (num_experts) {
    case 1:
@@ -508,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
      TORCH_CHECK(softmax_workspace != nullptr,
          "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
      static constexpr int TPB = 256;
      moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
      moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
          gating_output, nullptr, softmax_workspace, num_experts);
      moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
          softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
          num_experts, topk, 0, num_experts);
          num_experts, topk, 0, num_experts, renormalize);
    }
  }
}
@@ -520,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe
} // namespace vllm


template<typename ComputeType>
void dispatch_topk_softmax_launch(
    torch::Tensor& gating_output,
    torch::Tensor& topk_weights,
    torch::Tensor& topk_indices,
    torch::Tensor& token_expert_indices,
    torch::Tensor& softmax_workspace,
    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
    if (topk_indices.scalar_type() == at::ScalarType::Int) {
        vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens, num_experts, topk, renormalize, stream);
    } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
        vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<uint32_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens, num_experts, topk, renormalize, stream);
    } else {
        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
        vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int64_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens, num_experts, topk, renormalize, stream);
    }
}

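For reference, the per-token contract implemented by the paths above is: softmax over the gating logits, selection of the top-k weights, and, when renormalize is set, rescaling those k weights to sum to 1 (guarding against a zero sum as the kernel does). The following is a minimal host-side reference sketch of that contract, written for illustration only and not taken from vLLM:

#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// Reference for one token's gating row; `logits` has num_experts entries and k <= num_experts.
std::vector<std::pair<float, int>> topk_softmax_ref(const std::vector<float>& logits, int k, bool renormalize) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    std::vector<std::pair<float, int>> probs(logits.size());
    float denom = 0.f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = {std::exp(logits[i] - max_logit), static_cast<int>(i)};
        denom += probs[i].first;
    }
    for (auto& p : probs) p.first /= denom;                         // softmax
    std::partial_sort(probs.begin(), probs.begin() + k, probs.end(),
                      [](const auto& a, const auto& b) { return a.first > b.first; });
    probs.resize(k);                                                // keep top-k (weight, expert) pairs
    if (renormalize) {
        float sum = 0.f;
        for (const auto& p : probs) sum += p.first;
        const float scale = sum > 0.f ? 1.f / sum : 1.f;            // same zero-sum guard as the kernel
        for (auto& p : probs) p.first *= scale;
    }
    return probs;
}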
void topk_softmax(
    torch::Tensor& topk_weights,          // [num_tokens, topk]
    torch::Tensor& topk_indices,          // [num_tokens, topk]
    torch::Tensor& token_expert_indices,  // [num_tokens, topk]
    torch::Tensor& gating_output)         // [num_tokens, num_experts]
    torch::Tensor& gating_output,         // [num_tokens, num_experts]
    bool renormalize)
{
    const int num_experts = gating_output.size(-1);
    const auto num_tokens = gating_output.numel() / num_experts;
@@ -536,45 +689,19 @@ void topk_softmax(

    const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

    if(topk_indices.scalar_type() == at::ScalarType::Int)
    {
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    }
    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
    {
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<uint32_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    }
    else {
        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int64_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
            num_tokens,
            num_experts,
            topk,
            stream);
    if (gating_output.scalar_type() == at::ScalarType::Float) {
        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
    } else {
        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
    }
}

@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Apply topk softmax to the gating outputs.
  m.def(
      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
      "token_expert_indices, Tensor gating_output) -> ()");
      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

  // Calculate the result of moe by summing up the partial results
@@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size, but for the batched case.
  m.def(
      "batched_moe_align_block_size(int max_tokens_per_batch,"
      " int block_size, Tensor expert_num_tokens,"
      " Tensor! sorted_token_ids,"
      " Tensor! experts_ids,"
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("batched_moe_align_block_size", torch::kCUDA,
         &batched_moe_align_block_size);

#ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
csrc/ops.h
@@ -92,14 +92,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
               torch::Tensor& bias, double epsilon);

void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,
                                 const torch::Tensor& repetition_penalties);

void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
                   torch::Tensor& values, int64_t numRows, int64_t stride0,
                   int64_t stride1);

void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
                               double epsilon);
@@ -133,12 +135,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
#endif
void silu_mul_fp8_quant_deep_gemm_cuda(
void persistent_masked_m_silu_mul_quant(
    const at::Tensor& input,   // (E, T, 2*H)
    const at::Tensor& counts,  // (E)
    at::Tensor& y_q,           // (E, T, H) [OUT]
    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
    bool use_ue8m0);

void mul_and_silu(torch::Tensor& out, torch::Tensor& input);