Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-21 15:43:52 +08:00

Compare commits (872 commits)

Author | SHA1 | Date | |
---|---|---|---|
ed6e9075d3 | |||
992e5c3d34 | |||
b69692a2d8 | |||
a64a84433d | |||
aa1e62d0db | |||
497bc83124 | |||
3738e6fa80 | |||
0023cd2b9d | |||
041e294716 | |||
9621667874 | |||
8c755c3b6d | |||
ba81163997 | |||
0d243f2a54 | |||
88f6ba3281 | |||
512368e34a | |||
473f51cfd9 | |||
a4c402a756 | |||
550d97eb58 | |||
fbbe1fbac6 | |||
01c184b8f3 | |||
ad5a35c21b | |||
5ae9f26a5a | |||
377d10bd14 | |||
52ce14d31f | |||
81dabf24a8 | |||
423330263b | |||
caf7ff4456 | |||
f525c0be8b | |||
983a40a8bb | |||
fdc5df6f54 | |||
3b05cd4555 | |||
d5d214ac7f | |||
fd84857f64 | |||
8aada19dfc | |||
9aa95b0e6a | |||
d0a7a2769d | |||
00b69c2d27 | |||
4c82229898 | |||
c8d70e2437 | |||
30172b4947 | |||
a4d577b379 | |||
7b203b7694 | |||
4fb8142a0e | |||
a02c86b4dd | |||
3809458456 | |||
d3231cb436 | |||
435b502a6e | |||
29fc5772c4 | |||
2358ca527b | |||
8cf97f8661 | |||
e2603fefb8 | |||
b53d79983c | |||
9915912f7f | |||
d1b649f1ef | |||
ac19b519ed | |||
a1074b3efe | |||
00294e1bc6 | |||
88787bce1d | |||
932b51cedd | |||
7c7adf81fc | |||
67ef8f666a | |||
efbe854448 | |||
b3942e157e | |||
cd4a72a28d | |||
6ac485a953 | |||
4c21ce9eba | |||
ce77eb9410 | |||
30513d1cb6 | |||
1f69c4a892 | |||
7b623fca0b | |||
238dfc8ac3 | |||
45186834a0 | |||
f857311d13 | |||
46cdd59577 | |||
2010f04c17 | |||
69e1d23e1e | |||
d67cc21b78 | |||
e18227b04a | |||
7b89386553 | |||
da833b0aee | |||
5d2965b7d7 | |||
a0231b7c25 | |||
124776ebd5 | |||
b7d309860e | |||
dc0f7ccf8b | |||
d3d547e057 | |||
12913d17ba | |||
80f63a3966 | |||
367cb8ce8c | |||
54ed913f34 | |||
9206b3d7ec | |||
ed0de3e4b8 | |||
2ad1bc7afe | |||
7fdaaf48ef | |||
067fa2255b | |||
9076325677 | |||
97a3d6d995 | |||
579d7a63b2 | |||
c9f9d5b397 | |||
0c73026844 | |||
6a854c7a2b | |||
e7eea5a520 | |||
a12934d3ec | |||
3bcb8c75da | |||
5e5c8e091e | |||
c9e2d644e7 | |||
7734e9a291 | |||
6224a9f620 | |||
085b7b2d6c | |||
4da1f667e9 | |||
556ef7f714 | |||
83481ceb49 | |||
185cc19f92 | |||
45f90bcbba | |||
b0ccfc565a | |||
ba59b78a9c | |||
cbc40128eb | |||
f0b2da72a8 | |||
f2b20fe491 | |||
40932d7a05 | |||
84683fa271 | |||
067678262a | |||
09545c0a94 | |||
dd5ede4440 | |||
8c32b08a86 | |||
410886950a | |||
e38be640e6 | |||
c1e37bf71b | |||
2344192a55 | |||
bffddd9a05 | |||
d84cef76eb | |||
37dfa60037 | |||
1bc3b5e71b | |||
02ed8a1fbe | |||
2092a6fa7d | |||
c9d3ecf016 | |||
fdcf64d3c6 | |||
578087e56c | |||
fa253f1a70 | |||
9605c1256e | |||
0ccd8769fb | |||
cb944d5818 | |||
d46d490c27 | |||
04f50ad9d1 | |||
60c68df6d1 | |||
009439caeb | |||
bc55d13070 | |||
d88c8666a1 | |||
4fc5c23bb6 | |||
9f9704dca6 | |||
8eafe5eaea | |||
4c0d93f4b2 | |||
14b7899d10 | |||
09972e716c | |||
36a08630e8 | |||
2c2b560f48 | |||
042c3419fa | |||
82cabf53a3 | |||
314cfade02 | |||
985b4a2b19 | |||
f4d97e4fc2 | |||
f1042e86f0 | |||
7c4033acd4 | |||
d59def4730 | |||
0c7d9effce | |||
dd3b4a01f8 | |||
a0597c6b75 | |||
e92694b6fe | |||
842b0fd402 | |||
974dfd4971 | |||
3ee696a63d | |||
72c2b68dc9 | |||
14ecab5be2 | |||
deb6c1c6b4 | |||
565c1efa65 | |||
2b25b7d2e1 | |||
6c4dbe23eb | |||
21f5d50fa5 | |||
bf3e05215c | |||
ad9776353e | |||
75e6e14516 | |||
110f59a33e | |||
2e3b969ec0 | |||
da317197dd | |||
7539bbc6a6 | |||
9cf4759493 | |||
41c5dd45b9 | |||
fc6485d277 | |||
78a141d768 | |||
c320ca8edd | |||
58047c6f04 | |||
cb080f32e3 | |||
2c0f58203c | |||
2ff4857678 | |||
91e876750e | |||
08b2d845d6 | |||
2ae889052c | |||
51f0b5f7f6 | |||
fde71262e0 | |||
243137143c | |||
b2496bb07f | |||
44607e07d3 | |||
67c4637ccf | |||
aa0ca5ebb7 | |||
59fff4a01a | |||
29f1d47e73 | |||
cf797aa856 | |||
24700c346b | |||
d366ccc4e3 | |||
870c37481e | |||
86222a3dab | |||
fe743b798d | |||
913df14da3 | |||
8a69e0e20e | |||
4c8dd12ef3 | |||
256a2d29dc | |||
c45d398e6f | |||
011e612d92 | |||
7e1837676a | |||
2880e21e3d | |||
407b5537db | |||
4ea48fb35c | |||
e31498bdcb | |||
91dd8f7aa6 | |||
d01f66b039 | |||
cc01223f3b | |||
306923da82 | |||
3243158336 | |||
b21f0f9d17 | |||
45cbc4991d | |||
932c6b7461 | |||
eaa92d4437 | |||
0630d4537a | |||
538fab93cd | |||
ce26b16268 | |||
1918aa1b80 | |||
6e1fc61f0f | |||
aa375dca9f | |||
433c4a4923 | |||
ef533d25fb | |||
b260782357 | |||
741429a4cd | |||
aff404571b | |||
467a96a541 | |||
8108ac841d | |||
afe74f7a96 | |||
09b95e36ab | |||
85ac82d228 | |||
1e57b1ee63 | |||
e152f29502 | |||
c786e757fa | |||
cefd56ee35 | |||
7ca9934fe7 | |||
0408efc6d0 | |||
449d1bce02 | |||
1a6fcad4c9 | |||
56534cd577 | |||
d88506dda4 | |||
9cdea30b4f | |||
76abd0c881 | |||
5b19b93082 | |||
75404d041b | |||
bf3b79efb8 | |||
9a5b1554b4 | |||
a4ce74c14a | |||
3b2005e1db | |||
af8486de49 | |||
4c3aac51e1 | |||
bc1bdecebf | |||
022bcc701a | |||
c53dc466b1 | |||
3d09e592a8 | |||
fcf2e3d7fc | |||
58b218d7ae | |||
7ff7a638b6 | |||
686006a220 | |||
98fd089fc9 | |||
249824c3bf | |||
64862d106e | |||
b3a0d01e45 | |||
75e94309e8 | |||
233df6f5c4 | |||
18016a5e62 | |||
649550f27e | |||
62467a834a | |||
6469038b14 | |||
815079de8e | |||
18a88fcccc | |||
d1ca7df84d | |||
96b23621c1 | |||
c36ac98d01 | |||
4896d0c2dd | |||
bb392af434 | |||
5d98d56089 | |||
73b35cca7f | |||
5095e96606 | |||
cf58b9c4ca | |||
4797dad3ec | |||
6dd5e52823 | |||
c11de33dad | |||
33e0602e59 | |||
a1a2aaadb9 | |||
1298a400e8 | |||
ad4a9dc817 | |||
b9986454fe | |||
c5932e5dac | |||
20579c0fae | |||
95460fc513 | |||
326fcc8b9f | |||
e64330910b | |||
e489ad7a21 | |||
f256ebe4df | |||
f8ece6e17f | |||
abfcdcdf27 | |||
e497f33491 | |||
baaa2b24da | |||
b4e5c03306 | |||
3194039c0e | |||
4f4d427ac2 | |||
1e3698393f | |||
baeded2569 | |||
3e1c76cf3a | |||
cfa134d247 | |||
35b7a05507 | |||
1867c258bd | |||
cb3e73e4c8 | |||
b1340f9d55 | |||
44bbca78d7 | |||
60808bd4c7 | |||
fc542144c4 | |||
eb5741ad42 | |||
145c2ff648 | |||
415f19474d | |||
89003c4082 | |||
60bcef000e | |||
847f883232 | |||
325f679f32 | |||
e3f7ff65e7 | |||
7a8987dac5 | |||
cabaf4eff3 | |||
a1fc18c030 | |||
9798b2fb00 | |||
4078052f09 | |||
bd2107e30a | |||
9b0c4bab36 | |||
41bf5612f5 | |||
a2769032ca | |||
f17f1d4608 | |||
1c1bb0bbf2 | |||
e0cc5f259a | |||
73aa6cfdf7 | |||
27b78c73ca | |||
b02fd288b2 | |||
ff7424f491 | |||
d93bf4da85 | |||
036ca94c25 | |||
ef001d98ef | |||
5f671cb4c3 | |||
bd02164cf9 | |||
46fb056749 | |||
dd6a3a02cb | |||
a7e3eba66f | |||
fbb5bd4cef | |||
80fcc3ed1c | |||
c386c43ca3 | |||
f26d790718 | |||
0f657bdc52 | |||
3fd1fb63ef | |||
925d2f1908 | |||
8f58a51358 | |||
2079e43bee | |||
e29d4358ef | |||
8cbc424975 | |||
dd66fd2b01 | |||
0f465ab533 | |||
23a7cbc88b | |||
426a5c3625 | |||
ddee88d0ff | |||
823ab79633 | |||
6116ca8cd7 | |||
2bc3fbba0c | |||
3f1fc7425a | |||
01ba927040 | |||
103bd17ac5 | |||
ce69f7f754 | |||
624a1e4711 | |||
372bf0890b | |||
5204ff5c3f | |||
0cc6b383d7 | |||
28e0750847 | |||
582cf78798 | |||
0034b09ceb | |||
72bac73067 | |||
68f11149d8 | |||
72f4880425 | |||
aa2cd2c43d | |||
9ddc35220b | |||
a5255270c3 | |||
0ee349b553 | |||
fa63e710c7 | |||
2a0309a646 | |||
324960a95c | |||
f1fc0510df | |||
bf21481dde | |||
fb30ee92ee | |||
221d388cc5 | |||
3132a933b6 | |||
df5dafaa5b | |||
ab5bbf5ae3 | |||
3bb8e2c9a2 | |||
e784c6b998 | |||
9a0f3bdbe5 | |||
c7c9851036 | |||
3c818bdb42 | |||
6dd94dbe94 | |||
0e74d797ce | |||
55ef66edf4 | |||
5e5630a478 | |||
d3d6bb13fb | |||
24b0205f58 | |||
c5cffcd0cd | |||
682b55bc07 | |||
9726ad676d | |||
eb5cb5e528 | |||
2cbeedad09 | |||
2c85529bfc | |||
e97f802b2d | |||
6e650f56a1 | |||
3f50c148fd | |||
8c01b8022c | |||
99d01a5e3d | |||
d07efb31c5 | |||
978b45f399 | |||
c5b4b11d7f | |||
8ae5ff2009 | |||
511627445e | |||
f0ef37233e | |||
7551a34032 | |||
01a55941f5 | |||
8d7aa9de71 | |||
68c4421b6d | |||
aea94362c9 | |||
7206ce4ce1 | |||
96f6a7596f | |||
84bee4bd5c | |||
fc66dee76d | |||
6609cdf019 | |||
16366ee8bb | |||
528dbcac7d | |||
cd7b6f0857 | |||
68ad4e3a8d | |||
4004f144f3 | |||
66818e5b63 | |||
222a9dc350 | |||
cbdc4ad5a5 | |||
016e3676e7 | |||
64ea24d0b3 | |||
df76e5af26 | |||
09ccc9c8f7 | |||
69196a9bc7 | |||
2acba47d9b | |||
9c485d9e25 | |||
fa9ee08121 | |||
347eeebe3b | |||
18fd4a8331 | |||
132a132100 | |||
1e60f87bb3 | |||
9705b90bcf | |||
3aec49e56f | |||
c64612802b | |||
9a7c3a0042 | |||
b197a5ccfd | |||
c81081fece | |||
a94eee4456 | |||
f2e9f2a3be | |||
1f1542afa9 | |||
96912550c8 | |||
2fc6944c5e | |||
5fe6bf29d6 | |||
d4b62d4641 | |||
ecf67814f1 | |||
750f4cabfa | |||
06a760d6e8 | |||
da7512215f | |||
af69a6aded | |||
7bd3630067 | |||
96663699b2 | |||
18572e3384 | |||
86bfb6dba7 | |||
5f0ec3935a | |||
c222f47992 | |||
170eb35079 | |||
b37d82791e | |||
3127e975fb | |||
4001ea1266 | |||
5c89a29c22 | |||
59a0192fb9 | |||
83609791d2 | |||
0974c9bc5c | |||
d2643128f7 | |||
c5c06209ec | |||
3ea7b94523 | |||
51ef828f10 | |||
df450aa567 | |||
bbe5f9de7d | |||
81763c58a0 | |||
edaae198e7 | |||
936db119ed | |||
e66faf4809 | |||
630eb5b5ce | |||
4e94951bb1 | |||
7a8a48d51e | |||
32eb0da808 | |||
6d0e3d3724 | |||
02798ecabe | |||
813f249f02 | |||
da02cb4b27 | |||
c09503ddd6 | |||
2b83503227 | |||
7b98a65ae6 | |||
b5b57e301e | |||
54cacf008f | |||
58fd57ff1d | |||
87a0c076af | |||
d4e6194570 | |||
07934cc237 | |||
69d765f5a5 | |||
8027a72461 | |||
d75ab55f10 | |||
d1adb9b403 | |||
b8bfa46a18 | |||
1475847a14 | |||
fead53ba78 | |||
ebc73f2828 | |||
d06e824006 | |||
62b06ba23d | |||
5fd24ec02e | |||
874f7c292a | |||
92e793d91a | |||
bf53e0c70b | |||
dd7c9ad870 | |||
9aa1519f08 | |||
f8ef146f03 | |||
fa0050db08 | |||
cd9d06fb8d | |||
ebd8c669ef | |||
70755e819e | |||
edce722eaa | |||
57e729e874 | |||
de0526f668 | |||
5ecf3e0aaf | |||
97eb97b5a4 | |||
3adf0ffda8 | |||
ad388d25a8 | |||
cbe94391eb | |||
994fc655b7 | |||
3f9b7ab9f5 | |||
ad34c0df0f | |||
f218f9c24d | |||
0794e7446e | |||
b7ee940a82 | |||
9ddac56311 | |||
1a51b9f872 | |||
42f5e7c52a | |||
a3a3ee4e6f | |||
87054a57ab | |||
c9d6ff530b | |||
a2d2acb4c8 | |||
2e0e017610 | |||
1f18adb245 | |||
bb354e6b2d | |||
ff39141a49 | |||
8a1f938e6f | |||
078da31903 | |||
1a401252b5 | |||
f35ec461fc | |||
289b5191d5 | |||
c6db21313c | |||
a7d59688fb | |||
458e63a2c6 | |||
e8c23ff989 | |||
cd8249903f | |||
0f8cafe2d1 | |||
5340a30d01 | |||
89ce62a316 | |||
c3f05b09a0 | |||
cf6bbcb493 | |||
80ea3af1a0 | |||
9dd02d85ca | |||
f7b3ba82c3 | |||
619ae268c3 | |||
d14e98d924 | |||
9597a095f2 | |||
263a870ee1 | |||
8bddb73512 | |||
f967e51f38 | |||
43f3d9e699 | |||
b25cfab9a0 | |||
4b657d3292 | |||
d697dc01b4 | |||
a991f7d508 | |||
7a3a83e3b8 | |||
c32a7c7c0c | |||
2118d0565c | |||
899136b857 | |||
c9f09a4fe8 | |||
d45cbe70f5 | |||
8a579408f3 | |||
46fa98ccad | |||
aa1e77a19c | |||
5959564f94 | |||
f33e033e27 | |||
482cdc494e | |||
20410b2fda | |||
12664ddda5 | |||
241ad7b301 | |||
d85c47d6ad | |||
ef725feafc | |||
d907be7dc7 | |||
d53575a5f0 | |||
61af633256 | |||
ac2f3f7fee | |||
cf5f000d21 | |||
3de2b1eafb | |||
b844b99ad3 | |||
c3cf54dda4 | |||
36f5303578 | |||
9a228348d2 | |||
bd82872211 | |||
405eb8e396 | |||
65097ca0af | |||
1d967acb45 | |||
0bd1ff4346 | |||
310aca88c9 | |||
a732900efc | |||
d848800e88 | |||
730e9592e9 | |||
1fe554bac3 | |||
615e4a5401 | |||
3db0cafdf1 | |||
526de822d5 | |||
56fe4c297c | |||
47de8821d3 | |||
5984499e47 | |||
ca47e176af | |||
78f4590b60 | |||
2f7024987e | |||
6cd40a5bfe | |||
aba8d6ee00 | |||
2a0596bc48 | |||
f12141170a | |||
cfd3219f58 | |||
a1b2b8606e | |||
ad9f1aa679 | |||
889e662eae | |||
ef68eb28d8 | |||
259abd8953 | |||
f645eb6954 | |||
f4923cb8bc | |||
b640b19cc0 | |||
dc71af0a71 | |||
4d29e91be8 | |||
91445c7bc8 | |||
5950f555a1 | |||
a4e2b26856 | |||
973f5dc581 | |||
c994223d56 | |||
869579a702 | |||
c0efe92d8b | |||
d9fa1c05ad | |||
2de197bdd4 | |||
869e829b85 | |||
8f37be38eb | |||
8082ad7950 | |||
1e4ce295ae | |||
ce1917fcf2 | |||
e512f76a89 | |||
898cdf033e | |||
0f3f3c86ec | |||
b278557935 | |||
8ceffbf315 | |||
d93d2d74fd | |||
d0169e1b0f | |||
08fb75c72e | |||
91b361ae89 | |||
e20c92bb61 | |||
32c9eff2ff | |||
4ca5d40adc | |||
9279b9f83d | |||
ee77fdb5de | |||
996357e480 | |||
2a622d704a | |||
9c749713f6 | |||
022c5c6944 | |||
f8fcca100b | |||
06bfb51963 | |||
408e560015 | |||
402d378360 | |||
9e764e7b10 | |||
33fc1e2e86 | |||
eba17173d3 | |||
635b897246 | |||
4068f4b5b5 | |||
47831430cc | |||
65c08928c2 | |||
ba214dffbe | |||
eed11ebee9 | |||
300acb8347 | |||
d91457d529 | |||
fbf2564554 | |||
d1d49397e7 | |||
9c93636d84 | |||
e5d7ed0c53 | |||
ad0d567e1c | |||
bf0d97d786 | |||
a655eb3025 | |||
1543914c04 | |||
61fed92c7e | |||
80c751e7f6 | |||
e1a5c2f0a1 | |||
fd3a62a122 | |||
07064cb1d4 | |||
2f1e8e8f54 | |||
68d37809b9 | |||
5dba257506 | |||
187e32997c | |||
b55ed6ef8a | |||
2f385183f3 | |||
84c35c374a | |||
8c38ee7007 | |||
b6087a6bee | |||
23c1b10a4c | |||
a115ac46b5 | |||
73001445fb | |||
6d70198b17 | |||
f962f426bc | |||
11d8a091c6 | |||
365801fedd | |||
4db72e57f6 | |||
0c6f998554 | |||
e7c7c5e822 | |||
8c3230d8c1 | |||
2c5718809b | |||
82c49d3260 | |||
74fa1d123c | |||
a2a40bcd0d | |||
ccb1aabcca | |||
36e7670045 | |||
5886aa496e | |||
8d9b6721e7 | |||
b12e87f942 | |||
5dbf854553 | |||
970d6d0776 | |||
628ec6c17b | |||
3682e33f9f | |||
0aa38d16f5 | |||
faef77c0d6 | |||
dba4d9dec6 | |||
32b4c63f02 | |||
4fb8e329fd | |||
328841d002 | |||
d427e5cfda | |||
42bb201fd6 | |||
59d6bb4c86 | |||
b7dcc003dc | |||
d34be24bb1 | |||
b5cbe8eeb3 | |||
df04dffade | |||
a60731247f | |||
ac79799403 | |||
dde1fa18c9 | |||
0240402c46 | |||
55509c2114 | |||
101418096f | |||
5ce4627a7e | |||
7af553ea30 | |||
2c9b8ea2b0 | |||
d003f3ea39 | |||
6c6f7fe8a8 | |||
2339d59f92 | |||
1b875a0ef3 | |||
eb881ed006 | |||
46d4359450 | |||
81b979f2a8 | |||
371d04d39b | |||
0c0c2015c5 | |||
82d24f7aac | |||
f49777ba62 | |||
55fb97f7bd | |||
2072924d14 | |||
720b10fdc6 | |||
b85a977822 | |||
eec906d811 | |||
f57ee5650d | |||
dcb1a944d4 | |||
7492a36207 | |||
aa25985bd1 | |||
dbeac95dbb | |||
51a624bf02 | |||
6ad909fdda | |||
b689ada91e | |||
fc601665eb | |||
9832e5572a | |||
3f3e92e1f2 | |||
409475a827 | |||
196c34b0ac | |||
5c7963249d | |||
461cde2080 | |||
7a5286cc04 | |||
b1b1038fbd | |||
9edca6bf8f | |||
4f074fbf53 | |||
a491d6f535 | |||
32aa2059ad | |||
94d545a1a1 | |||
60fb4f3bcf | |||
63afbe9215 | |||
8cef6e02dc | |||
b866cdbd05 | |||
2e726680b3 | |||
5bfb30a529 | |||
e51719ae72 | |||
f30581c518 | |||
048fc57a0f | |||
f1d1bf6288 | |||
72d9c316d3 | |||
4a9139780a | |||
29c748930e | |||
c2d1b075ba | |||
584f0ae40d | |||
51ff216d85 | |||
dd2b5633dd | |||
47a0b615b4 | |||
5d2248d81a | |||
d573aeadcc | |||
995f56236b | |||
7c7aa37c69 | |||
04139ade59 | |||
1ecc645b8f | |||
c954f21ac0 | |||
86c2d8fd1c | |||
b880ffb87e | |||
7801f56ed7 | |||
48edab8041 | |||
a985f7af9f | |||
e461c262f0 | |||
276738ce0f | |||
cdf22afdda | |||
e24113a8fe | |||
7379b3d4b2 | |||
6c7f881541 | |||
a0f7d53beb | |||
5aef49806d | |||
98356735ac | |||
f26c4aeecb | |||
8936316d58 | |||
6142ef0ada | |||
c6b0a7d3ba | |||
a30482f054 | |||
17ca964273 | |||
5a9da2e6e9 | |||
fdea8ec167 | |||
ca5f54a9b9 | |||
f954fe0e65 | |||
362cff1eb3 | |||
996aa70f00 | |||
60508ffda9 | |||
f04e407e6b | |||
8b79f9e107 | |||
866fa4550d | |||
bf8717ebae | |||
c77eb8a33c |
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
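For orientation, here is a minimal, hypothetical sketch of the kind of check this script performs: open the wheel (a plain zip archive), report its largest members, and fail if the size exceeds the configured limit. It is an illustration only, not the repository's actual implementation; the wheel path passed on the command line is assumed.

```python
import os
import sys
import zipfile

# Default mirrors the limit shown in the diff above; override via VLLM_MAX_SIZE_MB.
MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


def wheel_size_mb(path: str) -> float:
    """On-disk size of the wheel file in MiB."""
    return os.path.getsize(path) / (1 << 20)


def top_10_largest_files(path: str):
    """The ten largest members inside the wheel (a zip archive)."""
    with zipfile.ZipFile(path) as zf:
        infos = sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)
        return [(i.filename, i.file_size / (1 << 20)) for i in infos[:10]]


if __name__ == "__main__":
    wheel = sys.argv[1]  # e.g. dist/vllm-<version>-... .whl (hypothetical path)
    size = wheel_size_mb(wheel)
    print(f"{wheel}: {size:.1f} MiB (limit: {MAX_SIZE_MB} MiB)")
    for name, mb in top_10_largest_files(wheel):
        print(f"  {mb:8.1f} MiB  {name}")
    sys.exit(0 if size <= MAX_SIZE_MB else 1)
```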
.buildkite/generate_index.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+        <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
New file (11 lines)
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@@ -1,15 +1,13 @@
 # vLLM benchmark suite
 
-
 ## Introduction
 
 This directory contains two sets of benchmark for vllm.
 
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 
-
 See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 
 ## Performance benchmark quick overview
 
@@ -19,17 +17,14 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
 
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
-
 ## Nightly benchmark quick overview
 
 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
 
 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
 
 **Benchmarking Duration**: about 3.5hrs.
 
-
-
 ## Trigger the benchmark
 
 Performance benchmark will be triggered when:
@@ -39,16 +34,11 @@ Performance benchmark will be triggered when:
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
-
-
-
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
-#### Latency test
+### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
 
@@ -68,23 +58,25 @@ Here is an example of one test inside `latency-tests.json`:
 ```
 
 In this example:
+
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
 - The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
 
-#### Throughput test
+### Throughput test
 
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 
-#### Serving test
+### Serving test
 
 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 
-```
+```json
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
@@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 ```
 
 Inside this example:
+
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
 - The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
@@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
-#### Visualizing the results
+### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 
-
-
 ## Nightly test details
 
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
 
-#### Workflow
+### Workflow
 
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
 - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
 - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
 - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 
-#### Nightly tests
+### Nightly tests
 
 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
 
-#### Docker containers
+### Docker containers
 
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 
 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
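To make the underline-to-dash convention described in the latency test section above concrete, here is a small sketch of how a `parameters` object could be turned into command line arguments. The conversion in `run-performance-benchmarks.sh` itself is done in shell, so this Python version is illustrative only, and the example keys are taken from the README text rather than from an actual test file.

```python
def params_to_cli_args(parameters: dict) -> list[str]:
    """Turn {"tensor_parallel_size": 1, ...} into ["--tensor-parallel-size", "1", ...]."""
    args = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if value == "":  # bare flags are encoded as empty strings in the json files
            args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args


# Example mirroring the README text; these keys are illustrative.
example = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(" ".join(params_to_cli_args(example)))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```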
@@ -1,5 +1,6 @@
 steps:
   - label: "Wait for container to be ready"
+    key: wait-for-container-image
     agents:
       queue: A100
     plugins:
@@ -9,13 +10,18 @@ steps:
           - image: badouralix/curl-jq
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - wait
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force
+
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - kubernetes:
         podSpec:
@@ -49,6 +55,8 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -65,15 +73,16 @@ steps:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
 
-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
 
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -89,3 +98,87 @@ steps:
         environment:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
+
+  # Premerge benchmark
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+        - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+        - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
@@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results.
 
 Please download the visualization scripts in the post
 
-
-
 ## Results reproduction
 
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code
-```
+  - In the same folder, run the following code:
+
+```console
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
 unzip nightly-benchmarks.zip
 VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
 ```
 
 And the results will be inside `./benchmarks/results`.
@@ -2,6 +2,7 @@
 # Nightly benchmark
+
 This benchmark aims to:
 
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 
@@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
 
 Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
-
 ## Setup
 
 - Docker images:
@@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
 - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
 - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 
-# Known issues
+## Known issues
 
 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
 - TGI does not support `ignore-eos` flag.
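As a small aside on the two latency metrics named above, TTFT and ITL can be derived from per-token arrival timestamps; the numbers below are made up for illustration and are not part of the benchmark code:

```python
# Hypothetical per-token arrival times (seconds after the request was sent).
token_arrival_times = [0.31, 0.35, 0.40, 0.44, 0.49]

# TTFT: time to the first token.
ttft = token_arrival_times[0]

# ITL: gaps between consecutive tokens.
itls = [b - a for a, b in zip(token_arrival_times, token_arrival_times[1:])]

print(f"TTFT: {ttft * 1000:.0f} ms")
print(f"mean ITL: {sum(itls) / len(itls) * 1000:.1f} ms")
```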
@@ -7,10 +7,8 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
-
 {latency_tests_markdown_table}
 
-
 ## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@@ -19,10 +17,8 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
-
 {throughput_tests_markdown_table}
 
-
 ## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@@ -33,13 +29,11 @@
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
-
 {serving_tests_markdown_table}
 
-
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.
 You can load the benchmarking tables into pandas dataframes as follows:
 
 ```python
@@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
 ```
 
 The json string for all benchmarking tables:
 
 ```json
 {benchmarking_results_in_json_string}
 ```
 
 You can also check the raw experiment data in the Artifact tab of the Buildkite page.
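For readers who want to go from the embedded JSON to the summary statistics mentioned above (mean, median, p99), a minimal pandas sketch follows. The file name `benchmark_results.json` and the column name `mean_ttft_ms` are assumptions for illustration; the real tables may use different labels.

```python
import json

import pandas as pd

# Hypothetical file holding the JSON string that the markdown section embeds.
with open("benchmark_results.json") as f:
    benchmarking_results = json.load(f)

serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])

# "mean_ttft_ms" is an assumed column name.
ttft = serving_results["mean_ttft_ms"]
print(f"TTFT mean:   {ttft.mean():.1f} ms")
print(f"TTFT median: {ttft.median():.1f} ms")
print(f"TTFT p99:    {ttft.quantile(0.99):.1f} ms")
```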
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 from pathlib import Path
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 from transformers import AutoTokenizer
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from pathlib import Path
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from lmdeploy.serve.openai.api_client import APIClient
 
 api_client = APIClient("http://localhost:8000")
@@ -43,7 +43,7 @@ main() {
 
 
-    # The figures should be genereated by a separate process outside the CI/CD pipeline
+    # The figures should be generated by a separate process outside the CI/CD pipeline
 
     # # generate figures
     # python3 -m pip install tabulate pandas matplotlib
@@ -301,6 +301,104 @@ run_serving_tests() {
   kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+  # run genai-perf tests
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
+
 prepare_dataset() {
@@ -328,12 +426,17 @@ main() {
 
   pip install -U transformers
 
+  pip install -r requirements-dev.txt
+  which genai-perf
+
   # check storage
   df -h
 
   ensure_installed wget
   ensure_installed curl
   ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
 
   prepare_dataset
 
@@ -345,6 +448,10 @@ main() {
   # run the test
   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
   # upload benchmark results to buildkite
   python3 -m pip install tabulate pandas
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
@@ -345,6 +345,11 @@ main() {
   check_gpus
   check_hf_token
 
+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import os
@@ -1,6 +1,10 @@
 #!/bin/sh
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+else
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+fi
 
 TIMEOUT_SECONDS=10
 
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json (new file, 23 lines)
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
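For reference, here is a Python sketch of how a test file like the one above can be consumed, mirroring the `run_genai_perf_tests` shell loop earlier in this diff. It only prints the commands, assumes the `genai-perf` CLI is installed separately, and the file path is taken from the diff rather than guaranteed to exist in your checkout.

```python
import json
import shlex

# Path of the file added in this diff.
TESTS_FILE = ".buildkite/nightly-benchmarks/tests/genai-perf-tests.json"

with open(TESTS_FILE) as f:
    test_cases = json.load(f)

for case in test_cases:
    common = case["common_parameters"]
    for qps in case["qps_list"]:
        # Mirror the shell script: an "inf" rate falls back to num_prompts.
        rate = common["num_prompts"] if qps == "inf" else qps
        cmd = [
            "genai-perf", "profile",
            "-m", common["model"],
            "--service-kind", "openai",
            "--backend", "vllm",
            "--endpoint-type", "chat",
            "--streaming",
            "--url", f"localhost:{common['port']}",
            "--request-rate", str(rate),
            "--num-prompts", str(common["num_prompts"]),
        ]
        print(shlex.join(cmd))
```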
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
 ]
@@ -66,8 +66,7 @@
       "swap_space": 16,
       "speculative_model": "turboderp/Qwama-0.5B-Instruct",
       "num_speculative_tokens": 4,
-      "speculative_draft_tensor_parallel_size": 1,
-      "use_v2_block_manager": ""
+      "speculative_draft_tensor_parallel_size": 1
     },
     "client_parameters": {
       "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|||||||
password-env: DOCKERHUB_TOKEN
|
password-env: DOCKERHUB_TOKEN
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
|
- input: "Provide Release version here"
|
||||||
|
fields:
|
||||||
|
- text: "What is the release version?"
|
||||||
|
key: "release-version"
|
||||||
|
|
||||||
|
- block: "Build CPU release image"
|
||||||
|
key: block-cpu-release-image-build
|
||||||
|
depends_on: ~
|
||||||
|
|
||||||
|
- label: "Build and publish CPU release image"
|
||||||
|
depends_on: block-cpu-release-image-build
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
@@ -121,6 +121,8 @@ if [[ $commands == *"--shard-id="* ]]; then
       --rm \
       -e HIP_VISIBLE_DEVICES="${GPU}" \
       -e HF_TOKEN \
+      -e AWS_ACCESS_KEY_ID \
+      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      --name "${container_name}_${GPU}" \
@@ -148,6 +150,8 @@ else
       --rm \
       -e HIP_VISIBLE_DEVICES=0 \
       -e HF_TOKEN \
+      -e AWS_ACCESS_KEY_ID \
+      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      --name "${container_name}" \
@@ -9,36 +9,33 @@ CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
---cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
---cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
 set -e
 export NUMA_NODE=$2

 # offline inference
-docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 set -e
-python3 examples/offline_inference.py"
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

 # Run basic model test
-docker exec cpu-test-"$NUMA_NODE" bash -c "
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
-pip install pytest pytest-asyncio \
-decord einops librosa peft Pillow sentence-transformers soundfile \
-transformers_stream_generator matplotlib datamodel_code_generator
-pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+pip install -r vllm/requirements-test.txt
 pytest -v -s tests/models/decoder_only/language -m cpu_model
 pytest -v -s tests/models/embedding/language -m cpu_model
 pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -46,26 +43,26 @@ function cpu_tests() {
 pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # Run compressed-tensor test
-docker exec cpu-test-"$NUMA_NODE" bash -c "
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

 # Run AWQ test
-docker exec cpu-test-"$NUMA_NODE" bash -c "
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
 tests/quantization/test_ipex_quant.py"

 # Run chunked-prefill and prefix-cache test
-docker exec cpu-test-"$NUMA_NODE" bash -c "
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v -k cpu_model \
 tests/basic_correctness/test_chunked_prefill.py"

-# online inference
-docker exec cpu-test-"$NUMA_NODE" bash -c "
+# online serving
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 set -e
 export VLLM_CPU_KVCACHE_SPACE=10
 export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -78,8 +75,14 @@ function cpu_tests() {
 --num-prompts 20 \
 --endpoint /v1/completions \
 --tokenizer facebook/opt-125m"

+# Run multi-lora tests
+docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+set -e
+pytest -s -v \
+tests/lora/test_qwen2vl.py"
 }

-# All of CPU tests are expected to be finished less than 25 mins.
+# All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -4,6 +4,9 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex

+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
 --target vllm-openai \
@@ -20,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference.py
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
@@ -8,9 +8,17 @@ set -ex
 docker build -t hpu-test-env -f Dockerfile.hpu .

 # Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
+EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
-trap remove_docker_container EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+EXITCODE=$?
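The comments added in this hunk explain why the HPU script captures the test's exit status itself: an EXIT trap that only removes the container can clobber `$?` on some HPU software stacks, so the script stores the status in EXITCODE and the trap re-raises it. A minimal standalone sketch of that pattern, with a hypothetical container name and a stand-in test command:

#!/bin/bash
# Sketch only: preserve a captured exit code across an EXIT trap.
EXITCODE=1                                        # assume failure until the test has run
cleanup() { docker rm -f example-test >/dev/null 2>&1 || true; }
cleanup_and_exit() { cleanup; exit $EXITCODE; }   # re-raise the captured status
trap cleanup_and_exit EXIT
false; EXITCODE=$?                                # stand-in for the real test command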
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
 last_build=$(cat /tmp/neuron-docker-build-timestamp)
 current_time=$(date +%s)
 if [ $((current_time - last_build)) -gt 86400 ]; then
-docker system prune -f
+# Remove dangling images (those that are not tagged and not used by any container)
+docker image prune -f
+# Remove unused volumes / force the system prune for old images as well.
+docker volume prune -f && docker system prune -f
 echo "$current_time" > /tmp/neuron-docker-build-timestamp
 fi
 else
 date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .

 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container

 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
---model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-timeout=300
-counter=0
-while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-sleep 1
-counter=$((counter + 1))
-if [ $counter -ge $timeout ]; then
-echo "Timeout after $timeout seconds"
-break
-fi
-done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-localhost:8000/generate \
--d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+-v "${HF_CACHE}:${HF_MOUNT}" \
+-e "HF_HOME=${HF_MOUNT}" \
+-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+--name "${container_name}" \
+${image_name} \
+/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m

.buildkite/run-tpu-test.sh (Normal file → Executable file)
@@ -14,4 +14,13 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it \
+-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+&& python3 -m pip install pytest \
+&& python3 -m pip install lm_eval[api]==0.4.4 \
+&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
+&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
@@ -14,6 +14,6 @@ remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference.py
-python3 examples/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
@@ -2,7 +2,7 @@
 # adding a new command to an existing step. See different options here for examples.

 # This script will be feed into Jinja template in `test-template-aws.j2` at
 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
 # to generate the final pipeline yaml file.

 # Documentation
@@ -15,7 +15,7 @@
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
 # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
 # num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
 # num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
 # in this case, commands must be specified. the first command runs on first host, the second
 # command runs on the second host.
 # working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
@@ -24,8 +24,8 @@
 # When adding a test
 # - If the test belong to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 # Note that all steps execute in parallel.

 steps:
 ##### fast check tests #####
@@ -38,7 +38,7 @@ steps:
 - pip install -r requirements-docs.txt
 - SPHINXOPTS=\"-W\" make html
 # Check API reference (if it fails, you may have missing mock imports)
-- grep \"sig sig-object py\" build/html/dev/sampling_params.html
+- grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
 fast_check: true
@@ -50,9 +50,9 @@ steps:
 - tests/multimodal
 - tests/test_utils
 - tests/worker
-- tests/standalone_tests/lazy_torch_compile.py
+- tests/standalone_tests/lazy_imports.py
 commands:
-- python3 standalone_tests/lazy_torch_compile.py
+- python3 standalone_tests/lazy_imports.py
 - pytest -v -s mq_llm_engine # MQLLMEngine
 - pytest -v -s async_engine # AsyncLLMEngine
 - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -76,7 +76,9 @@ steps:
 - tests/basic_correctness/test_basic_correctness
 - tests/basic_correctness/test_cpu_offload
 - tests/basic_correctness/test_preemption
+- tests/basic_correctness/test_cumem.py
 commands:
+- pytest -v -s basic_correctness/test_cumem.py
 - pytest -v -s basic_correctness/test_basic_correctness.py
 - pytest -v -s basic_correctness/test_cpu_offload.py
 - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
@@ -105,15 +107,17 @@ steps:
 mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
+- tests/entrypoints/llm
+- tests/entrypoints/openai
+- tests/entrypoints/test_chat_utils
+- tests/entrypoints/offline_mode
 commands:
-- pip install -e ./plugins/vllm_add_dummy_model
-- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
 - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
 - pytest -v -s entrypoints/test_chat_utils.py
 - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -124,24 +128,31 @@ steps:
 source_file_dependencies:
 - vllm/distributed/
 - vllm/core/
-- tests/distributed
+- tests/distributed/test_utils
+- tests/distributed/test_pynccl
 - tests/spec_decode/e2e/test_integration_dist_tp4
-- tests/compile
+- tests/compile/test_basic_correctness
+- examples/offline_inference/rlhf.py
+- examples/offline_inference/rlhf_colocate.py
 commands:
 - pytest -v -s distributed/test_utils.py
 - pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+# TODO: create a dedicated test section for multi-GPU example tests
+# when we have multiple distributed example tests
+- python3 ../examples/offline_inference/rlhf.py
+- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py

 - label: Metrics, Tracing Test # 10min
 num_gpus: 2
 fast_check: true
 source_file_dependencies:
 - vllm/
 - tests/metrics
 - tests/tracing
 commands:
 - pytest -v -s metrics
 - "pip install \
 'opentelemetry-sdk>=1.26.0,<1.27.0' \
 'opentelemetry-api>=1.26.0,<1.27.0' \
@@ -168,6 +179,9 @@ steps:
 - vllm/
 - tests/engine
 - tests/tokenization
+- tests/test_sequence
+- tests/test_config
+- tests/test_logger
 commands:
 - pytest -v -s engine test_sequence.py test_config.py test_logger.py
 # OOM in the CI unless we run this separately
@@ -179,7 +193,19 @@ steps:
 - vllm/
 - tests/v1
 commands:
-- VLLM_USE_V1=1 pytest -v -s v1
+# split the test to avoid interference
+- VLLM_USE_V1=1 pytest -v -s v1/core
+- VLLM_USE_V1=1 pytest -v -s v1/engine
+- VLLM_USE_V1=1 pytest -v -s v1/sample
+- VLLM_USE_V1=1 pytest -v -s v1/worker
+- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+# TODO: accuracy does not match, whether setting
+# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+- VLLM_USE_V1=1 pytest -v -s v1/e2e
+# Integration test for streaming correctness (requires special branch).
+- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
+- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

 - label: Examples Test # 25min
 working_dir: "/vllm-workspace/examples"
@@ -189,19 +215,19 @@ steps:
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-- python3 offline_inference.py
-- python3 cpu_offload.py
-- python3 offline_inference_chat.py
-- python3 offline_inference_with_prefix.py
-- python3 llm_engine_example.py
-- python3 offline_inference_vision_language.py
-- python3 offline_inference_vision_language_multi_image.py
-- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-- python3 offline_inference_encoder_decoder.py
-- python3 offline_inference_classification.py
-- python3 offline_inference_embedding.py
-- python3 offline_inference_scoring.py
-- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+- python3 offline_inference/basic/generate.py --model facebook/opt-125m
+- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+- python3 offline_inference/basic/chat.py
+- python3 offline_inference/prefix_caching.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/vision_language.py
+- python3 offline_inference/vision_language_multi_image.py
+- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/encoder_decoder.py
+- python3 offline_inference/basic/classify.py
+- python3 offline_inference/basic/embed.py
+- python3 offline_inference/basic/score.py
+- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
 mirror_hardwares: [amd]
@@ -216,6 +242,7 @@ steps:
 - vllm/model_executor/layers
 - vllm/sampling_metadata.py
 - tests/samplers
+- tests/conftest.py
 commands:
 - pytest -v -s samplers
 - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -224,23 +251,29 @@ steps:
 mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/model_executor/layers
+- vllm/model_executor/guided_decoding
 - tests/test_logits_processor
-command: pytest -v -s test_logits_processor.py
+- tests/model_executor/test_guided_processors
+commands:
+- pytest -v -s test_logits_processor.py
+- pytest -v -s model_executor/test_guided_processors.py

-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 40min
 source_file_dependencies:
 - vllm/spec_decode
 - tests/spec_decode
+- vllm/model_executor/models/eagle.py
 commands:
 - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+- pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
 mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/lora
 - tests/lora
-command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
 parallelism: 4

 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -306,6 +339,14 @@ steps:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - bash ./run-tests.sh -c configs/models-small.txt -t 1

+- label: OpenAI API correctness
+source_file_dependencies:
+- csrc/
+- vllm/entrypoints/openai/
+- vllm/model_executor/models/whisper.py
+commands: # LMEval+Transcription WER check
+- pytest -s entrypoints/openai/correctness/
+
 - label: Encoder Decoder tests # 5min
 source_file_dependencies:
 - vllm/
@@ -329,8 +370,7 @@ steps:
 - vllm/
 - tests/models
 commands:
-- pip install -e ./plugins/vllm_add_dummy_model
-- pytest -v -s models/test_oot_registration.py # it needs a clean process
+- pytest -v -s models/test_transformers.py
 - pytest -v -s models/test_registry.py
 - pytest -v -s models/test_initialization.py

@@ -356,23 +396,26 @@ steps:
 - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
 - pytest -v -s models/embedding/language -m 'not core_model'

-- label: Multi-Modal Models Test (Standard) # 28min
+- label: Multi-Modal Models Test (Standard) # 40min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
 - tests/models/decoder_only/audio_language
 - tests/models/decoder_only/vision_language
 - tests/models/embedding/vision_language
+- tests/models/encoder_decoder/audio_language
 - tests/models/encoder_decoder/vision_language
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+- pytest -v -s models/multimodal
 - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
 - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
 - pytest -v -s models/embedding/vision_language -m core_model
+- pytest -v -s models/encoder_decoder/audio_language -m core_model
 - pytest -v -s models/encoder_decoder/language -m core_model
 - pytest -v -s models/encoder_decoder/vision_language -m core_model

-- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+- label: Multi-Modal Models Test (Extended) 1 # 48m
 optional: true
 source_file_dependencies:
 - vllm/
@@ -455,21 +498,45 @@ steps:
 - vllm/worker/worker_base.py
 - vllm/worker/worker.py
 - vllm/worker/model_runner.py
+- entrypoints/llm/test_collective_rpc.py
 commands:
+- pytest -v -s entrypoints/llm/test_collective_rpc.py
+- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
 - pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
 # Avoid importing model tests that cause CUDA reinitialization error
+- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
 - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
 - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
 - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-- pip install -e ./plugins/vllm_add_dummy_model
-- pytest -v -s distributed/test_distributed_oot.py
+# this test fails consistently.
+# TODO: investigate and fix
+# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

+- label: Plugin Tests (2 GPUs) # 40min
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+fast_check: true
+source_file_dependencies:
+- vllm/plugins/
+- tests/plugins/
+commands:
+# begin platform plugin tests, all the code in-between runs on dummy platform
+- pip install -e ./plugins/vllm_add_dummy_platform
+- pytest -v -s plugins_tests/test_platform_plugins.py
+- pip uninstall vllm_add_dummy_platform -y
+# end platform plugin tests
+# other tests continue here:
+- pytest -v -s plugins_tests/test_scheduler_plugins.py
+- pip install -e ./plugins/vllm_add_dummy_model
+- pytest -v -s distributed/test_distributed_oot.py
+- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+- pytest -v -s models/test_oot_registration.py # it needs a clean process
+
 - label: Multi-step Tests (4 GPUs) # 36min
 working_dir: "/vllm-workspace/tests"
 num_gpus: 4
@@ -485,7 +552,9 @@ steps:
 - vllm/engine
 - tests/multi_step
 commands:
-- pytest -v -s multi_step/test_correctness_async_llm.py
+# this test is quite flaky
+# TODO: investigate and fix.
+# - pytest -v -s multi_step/test_correctness_async_llm.py
 - pytest -v -s multi_step/test_correctness_llm.py

 - label: Pipeline Parallelism Test # 45min
@@ -512,10 +581,11 @@ steps:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 # This test runs llama 13B, so it is required to run on 4 GPUs.
 - pytest -v -s -x lora/test_long_context.py
 # There is some Tensor Parallelism related processing logic in LoRA that
 # requires multi-GPU testing for validation.
 - pytest -v -s -x lora/test_chatglm3_tp.py
 - pytest -v -s -x lora/test_llama_tp.py
+- pytest -v -s -x lora/test_minicpmv_tp.py


 - label: Weight Loading Multiple GPU Test # 33min
@@ -536,7 +606,7 @@ steps:
 - vllm/
 - tests/weight_loading
 commands:
 - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt


 ##### multi gpus test #####
@@ -548,7 +618,7 @@ steps:
 num_gpus: 4
 source_file_dependencies:
 - vllm/
 commands:
 # NOTE: don't test llama model here, it seems hf implementation is buggy
 # see https://github.com/vllm-project/vllm/pull/5689 for details
 - pytest -v -s distributed/test_custom_all_reduce.py
@@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"

+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
 suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
 new_version="1.0.0.dev"
 fi
 new_wheel="${wheel/$version/$new_version}"
-mv -- "$wheel" "$new_wheel"
+# use cp to keep both files in the artifacts directory
+cp -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
 version="$new_version"
 fi

 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+# if $normal_wheel matches cu118, do not upload the index.html
+echo "Skipping index files for cu118 wheels"
+else
+# only upload index.html for cu12 wheels (default wheels)
+aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+# if $normal_wheel matches cu118, do not upload the index.html
+echo "Skipping index files for cu118 wheels"
+else
+# only upload index.html for cu12 wheels (default wheels)
+aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
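The rename above relies on bash pattern substitution, `${wheel/$version/$new_version}`, which replaces the first occurrence of the version string inside the wheel filename while leaving the rest intact. A small sketch with a made-up wheel name (not part of the CI script):

# Sketch only: demonstrates the ${var/old/new} substitution used by upload-wheels.
wheel="vllm-0.7.1.dev123-cp38-abi3-manylinux1_x86_64.whl"
version="0.7.1.dev123"
new_version="1.0.0.dev"
new_wheel="${wheel/$version/$new_version}"
echo "$new_wheel"   # prints vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl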
.github/CODEOWNERS
@@ -2,32 +2,35 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/guided_decoding @mgoin
+/vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

 # Test ownership
-/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/multi_step @alexm-redhat @comaniac
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
@@ -30,15 +30,6 @@ body:
 </details>
 validations:
 required: true
-- type: textarea
-attributes:
-label: Model Input Dumps
-description: |
-If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
-placeholder: |
-Upload the dumped input file.
-validations:
-required: false
 - type: textarea
 attributes:
 label: 🐛 Describe the bug
@@ -9,7 +9,7 @@ body:
 value: >
 #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
 attributes:
 label: The model to consider.

.github/PULL_REQUEST_TEMPLATE.md
@@ -2,4 +2,5 @@ FILL IN THE PR DESCRIPTION HERE

 FIX #xxxx (*link existing issues this PR will resolve*)

-**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
+<!--- pyml disable-next-line no-emphasis-as-heading -->
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

.github/mergify.yml
@@ -35,6 +35,43 @@ pull_request_rules:
 add:
 - frontend

+- name: label-structured-output
+description: Automatically apply structured-output label
+conditions:
+- or:
+- files~=^vllm/model_executor/guided_decoding/
+- files=tests/model_executor/test_guided_processors.py
+- files=tests/entrypoints/llm/test_guided_generate.py
+- files=benchmarks/benchmark_serving_guided.py
+- files=benchmarks/benchmark_guided.py
+actions:
+label:
+add:
+- structured-output
+
+- name: label-speculative-decoding
+description: Automatically apply speculative-decoding label
+conditions:
+- or:
+- files~=^vllm/spec_decode/
+- files=vllm/model_executor/layers/spec_decode_base_sampler.py
+- files~=^tests/spec_decode/
+actions:
+label:
+add:
+- speculative-decoding
+
+- name: label-v1
+description: Automatically apply v1 label
+conditions:
+- or:
+- files~=^vllm/v1/
+- files~=^tests/v1/
+actions:
+label:
+add:
+- v1
+
 - name: ping author on conflicts and add 'needs-rebase' label
 conditions:
 - conflict

.github/workflows/actionlint.yml
@@ -1,40 +0,0 @@
-name: Lint GitHub Actions workflows
-on:
-push:
-branches:
-- "main"
-paths:
-- '.github/workflows/*.ya?ml'
-- '.github/workflows/actionlint.*'
-- '.github/workflows/matchers/actionlint.json'
-pull_request:
-branches:
-- "main"
-paths:
-- '.github/workflows/*.ya?ml'
-- '.github/workflows/actionlint.*'
-- '.github/workflows/matchers/actionlint.json'
-
-env:
-LC_ALL: en_US.UTF-8
-
-defaults:
-run:
-shell: bash
-
-permissions:
-contents: read
-
-jobs:
-actionlint:
-runs-on: ubuntu-latest
-steps:
-- name: "Checkout"
-uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-with:
-fetch-depth: 0
-
-- name: "Run actionlint"
-run: |
-echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-tools/actionlint.sh -color

.github/workflows/clang-format.yml
@@ -1,53 +0,0 @@
-name: clang-format
-
-on:
-# Trigger the workflow on push or pull request,
-# but only for the main branch
-push:
-branches:
-- main
-paths:
-- '**/*.h'
-- '**/*.cpp'
-- '**/*.cu'
-- '**/*.cuh'
-- '.github/workflows/clang-format.yml'
-pull_request:
-branches:
-- main
-paths:
-- '**/*.h'
-- '**/*.cpp'
-- '**/*.cu'
-- '**/*.cuh'
-- '.github/workflows/clang-format.yml'
-
-jobs:
-clang-format:
-runs-on: ubuntu-latest
-strategy:
-matrix:
-python-version: ["3.11"]
-steps:
-- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-- name: Set up Python ${{ matrix.python-version }}
-uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-with:
-python-version: ${{ matrix.python-version }}
-- name: Install dependencies
-run: |
-python -m pip install --upgrade pip
-pip install clang-format==18.1.5
-- name: Running clang-format
-run: |
-EXCLUDES=(
-'csrc/moe/topk_softmax_kernels.cu'
-'csrc/quantization/gguf/ggml-common.h'
-'csrc/quantization/gguf/dequantize.cuh'
-'csrc/quantization/gguf/vecdotq.cuh'
-'csrc/quantization/gguf/mmq.cuh'
-'csrc/quantization/gguf/mmvq.cuh'
-)
-find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
-| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
-| xargs clang-format --dry-run --Werror

.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

 - name: Set up Python
-uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
 with:
 python-version: '3.12'

.github/workflows/codespell.yml
@ -1,45 +0,0 @@
|
|||||||
name: codespell
|
|
||||||
|
|
||||||
on:
|
|
||||||
# Trigger the workflow on push or pull request,
|
|
||||||
# but only for the main branch
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths:
|
|
||||||
- "**/*.py"
|
|
||||||
- "**/*.md"
|
|
||||||
- "**/*.rst"
|
|
||||||
- pyproject.toml
|
|
||||||
- requirements-lint.txt
|
|
||||||
- .github/workflows/codespell.yml
|
|
||||||
pull_request:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths:
|
|
||||||
- "**/*.py"
|
|
||||||
- "**/*.md"
|
|
||||||
- "**/*.rst"
|
|
||||||
- pyproject.toml
|
|
||||||
- requirements-lint.txt
|
|
||||||
- .github/workflows/codespell.yml
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
codespell:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
python-version: ["3.12"]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install -r requirements-lint.txt
|
|
||||||
- name: Spelling check with codespell
|
|
||||||
run: |
|
|
||||||
codespell --toml pyproject.toml
|
|
.github/workflows/lint-and-deploy.yaml (vendored, 11 changed lines)
@@ -17,17 +17,17 @@ jobs:
          version: v3.14.4

      #Python is required because ct lint runs Yamale and yamllint which require Python.
-     - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+     - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'

      - name: Set up chart-testing
-       uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+       uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1

      - name: Run chart-testing (lint)
-       run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+       run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm

      - name: Setup minio
        run: |
@@ -47,7 +47,7 @@ jobs:
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
-       uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+       uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
@@ -64,7 +64,8 @@ jobs:
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
-         helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+         sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
+         helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |

.github/workflows/matchers/ruff.json (vendored, deleted, 17 lines)
@@ -1,17 +0,0 @@
-{
-  "problemMatcher": [
-    {
-      "owner": "ruff",
-      "pattern": [
-        {
-          "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
-          "file": 1,
-          "line": 2,
-          "column": 3,
-          "code": 4,
-          "message": 5
-        }
-      ]
-    }
-  ]
-}

.github/workflows/mypy.yaml (vendored, deleted, 51 lines)
@@ -1,51 +0,0 @@
-name: mypy
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - '**/*.py'
-      - '.github/workflows/mypy.yaml'
-      - 'tools/mypy.sh'
-      - 'pyproject.toml'
-  pull_request:
-    branches:
-      - main
-    # This workflow is only relevant when one of the following files changes.
-    # However, we have github configured to expect and require this workflow
-    # to run and pass before github with auto-merge a pull request. Until github
-    # allows more flexible auto-merge policy, we can just run this on every PR.
-    # It doesn't take that long to run, anyway.
-    #paths:
-    #  - '**/*.py'
-    #  - '.github/workflows/mypy.yaml'
-    #  - 'tools/mypy.sh'
-    #  - 'pyproject.toml'
-
-jobs:
-  mypy:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install mypy==1.11.1
-        pip install types-setuptools
-        pip install types-PyYAML
-        pip install types-requests
-        pip install types-setuptools
-    - name: Mypy
-      run: |
-        echo "::add-matcher::.github/workflows/matchers/mypy.json"
-        tools/mypy.sh 1 ${{ matrix.python-version }}

.github/workflows/png-lint.yml (vendored, deleted, 37 lines)
@@ -1,37 +0,0 @@
-name: Lint PNG exports from excalidraw
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '*.excalidraw.png'
-      - '.github/workflows/png-lint.yml'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '*.excalidraw.png'
-      - '.github/workflows/png-lint.yml'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Run png-lint.sh to check excalidraw exported images"
-        run: |
-          tools/png-lint.sh

.github/workflows/pre-commit.yml (vendored, new file, 20 lines)
@@ -0,0 +1,20 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      with:
+        python-version: "3.12"
+    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
+    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      with:
+        extra_args: --all-files --hook-stage manual

.github/workflows/publish.yml (vendored, 105 changed lines)
@@ -39,67 +39,68 @@ jobs:
           const script = require('.github/workflows/scripts/create_release.js')
           await script(github, context, core)

-  wheel:
-    name: Build Wheel
-    runs-on: ${{ matrix.os }}
-    needs: release
-
-    strategy:
-      fail-fast: false
-      matrix:
-        os: ['ubuntu-20.04']
-        python-version: ['3.9', '3.10', '3.11', '3.12']
-        pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
-        cuda-version: ['11.8', '12.1']
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
-        with:
-          create-symlink: true
-          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
-      - name: Set up Linux Env
-        if: ${{ runner.os == 'Linux' }}
-        run: |
-          bash -x .github/workflows/scripts/env.sh
-
-      - name: Set up Python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
-      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
-        run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
-      - name: Build wheel
-        shell: bash
-        env:
-          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
-        run: |
-          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
-          asset_name=${wheel_name//"linux"/"manylinux1"}
-          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
-          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
-      - name: Upload Release Asset
-        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.release.outputs.upload_url }}
-          asset_path: ./dist/${{ env.wheel_name }}
-          asset_name: ${{ env.asset_name }}
-          asset_content_type: application/*
+  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
+  # wheel:
+  #   name: Build Wheel
+  #   runs-on: ${{ matrix.os }}
+  #   needs: release
+
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: ['ubuntu-20.04']
+  #       python-version: ['3.9', '3.10', '3.11', '3.12']
+  #       pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
+  #       cuda-version: ['11.8', '12.1']
+
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+  #     - name: Setup ccache
+  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+  #       with:
+  #         create-symlink: true
+  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+  #     - name: Set up Linux Env
+  #       if: ${{ runner.os == 'Linux' }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/env.sh
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+
+  #     - name: Install CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+  #     - name: Build wheel
+  #       shell: bash
+  #       env:
+  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+  #       run: |
+  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+  #         asset_name=${wheel_name//"linux"/"manylinux1"}
+  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+  #     - name: Upload Release Asset
+  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       with:
+  #         upload_url: ${{ needs.release.outputs.upload_url }}
+  #         asset_path: ./dist/${{ env.wheel_name }}
+  #         asset_name: ${{ env.asset_name }}
+  #         asset_content_type: application/*

   # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
   # - name: Publish package

.github/workflows/reminder_comment.yml (vendored, 8 changed lines)
@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
 on:
   pull_request_target:
     types: [opened]
-
 jobs:
   pr_reminder:
     runs-on: ubuntu-latest
@@ -15,7 +14,12 @@ jobs:
             owner: context.repo.owner,
             repo: context.repo.repo,
             issue_number: context.issue.number,
-            body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+            body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+              '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+              'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
+              'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+              'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+              '🚀'
           })
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/ruff.yml (vendored, deleted, 52 lines)
@@ -1,52 +0,0 @@
-name: ruff
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/matchers/ruff.json
-      - .github/workflows/ruff.yml
-  pull_request:
-    branches:
-      - main
-    # This workflow is only relevant when one of the following files changes.
-    # However, we have github configured to expect and require this workflow
-    # to run and pass before github with auto-merge a pull request. Until github
-    # allows more flexible auto-merge policy, we can just run this on every PR.
-    # It doesn't take that long to run, anyway.
-    #paths:
-    #  - "**/*.py"
-    #  - pyproject.toml
-    #  - requirements-lint.txt
-    #  - .github/workflows/matchers/ruff.json
-    #  - .github/workflows/ruff.yml
-
-jobs:
-  ruff:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Analysing the code with ruff
-      run: |
-        echo "::add-matcher::.github/workflows/matchers/ruff.json"
-        ruff check --output-format github .
-    - name: Run isort
-      run: |
-        isort . --check-only

.github/workflows/shellcheck.yml (vendored, deleted, 37 lines)
@@ -1,37 +0,0 @@
-name: Lint shell scripts
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  shellcheck:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Check shell scripts"
-        run: |
-          tools/shellcheck.sh

.github/workflows/sphinx-lint.yml (vendored, deleted, 32 lines)
@@ -1,32 +0,0 @@
-name: Lint documentation
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-
-jobs:
-  sphinx-lint:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Linting docs
-      run: tools/sphinx-lint.sh

.github/workflows/stale.yml (vendored, 2 changed lines)
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months

.github/workflows/yapf.yml (vendored, deleted, 38 lines)
@@ -1,38 +0,0 @@
-name: yapf
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-
-jobs:
-  yapf:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install yapf==0.32.0
-        pip install toml==0.10.2
-    - name: Running yapf
-      run: |
-        yapf --diff --recursive .

.gitignore (vendored, 3 changed lines)
@@ -79,8 +79,7 @@ instance/

 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
+docs/source/getting_started/examples/

 # PyBuilder
 .pybuilder/

.pre-commit-config.yaml (new file, 138 lines)
@@ -0,0 +1,138 @@
+default_stages:
+  - pre-commit # Run locally
+  - manual # Run in CI
+repos:
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
+  hooks:
+  - id: yapf
+    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
+    exclude: 'vllm/third_party/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.3
+  hooks:
+  - id: ruff
+    args: [--output-format, github, --fix]
+    exclude: 'vllm/third_party/.*'
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.4.0
+  hooks:
+  - id: codespell
+    additional_dependencies: ['tomli']
+    args: ['--toml', 'pyproject.toml']
+- repo: https://github.com/PyCQA/isort
+  rev: 5.13.2
+  hooks:
+  - id: isort
+    exclude: 'vllm/third_party/.*'
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v19.1.7
+  hooks:
+  - id: clang-format
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+    types_or: [c++, cuda]
+    args: [--style=file, --verbose]
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.27
+  hooks:
+  - id: pymarkdown
+    args: [fix]
+    exclude: 'vllm/third_party/.*'
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.7
+  hooks:
+  - id: actionlint
+    exclude: 'vllm/third_party/.*'
+- repo: local
+  hooks:
+  - id: mypy-local
+    name: Run mypy for local Python installation
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    stages: [pre-commit] # Don't run in CI
+    exclude: 'vllm/third_party/.*'
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+    exclude: 'vllm/third_party/.*'
+  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.10
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+    exclude: 'vllm/third_party/.*'
+  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.11
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+    exclude: 'vllm/third_party/.*'
+  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.12
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+    exclude: 'vllm/third_party/.*'
+  - id: shellcheck
+    name: Lint shell scripts
+    entry: tools/shellcheck.sh
+    language: script
+    types: [shell]
+    exclude: 'vllm/third_party/.*'
+  - id: png-lint
+    name: Lint PNG exports from excalidraw
+    entry: tools/png-lint.sh
+    language: script
+    types: [png]
+    exclude: 'vllm/third_party/.*'
+  - id: signoff-commit
+    name: Sign-off Commit
+    entry: bash
+    args:
+      - -c
+      - |
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+        fi
+    language: system
+    verbose: true
+    stages: [commit-msg]
+    exclude: 'vllm/third_party/.*'
+  - id: check-spdx-header
+    name: Check SPDX headers
+    entry: python tools/check_spdx_header.py
+    language: python
+    types: [python]
+    exclude: 'vllm/third_party/.*'
+  - id: check-filenames
+    name: Check for spaces in all filenames
+    entry: bash
+    args:
+      - -c
+      - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
+    language: system
+    always_run: true
+    pass_filenames: false
+    exclude: 'vllm/third_party/.*'
+  # Keep `suggestion` last
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true
+    pass_filenames: false
+    exclude: 'vllm/third_party/.*'
+  # Insert new entries above the `suggestion` entry

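The pre-commit.yml workflow above invokes these hooks in CI with `--all-files --hook-stage manual`. A minimal sketch of reproducing the same checks locally, assuming the `pre-commit` tool is installed from PyPI and the commands are run from the repository root:

```bash
# Install the pre-commit tool and register the git hooks declared in the config above.
pip install pre-commit
pre-commit install                          # runs the pre-commit stage on every local commit
pre-commit install --hook-type commit-msg   # enables the signoff-commit (commit-msg stage) hook

# One-off run over the whole tree, mirroring the CI invocation; --hook-stage manual
# also triggers the mypy-3.x hooks that are otherwise marked "Only run in CI".
pre-commit run --all-files --hook-stage manual
```

As the `suggestion` hook notes, individual commits can still skip the hooks with `git commit --no-verify`.
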
CMakeLists.txt (140 changed lines, mode changed from normal file to executable file)
@@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
@@ -37,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -181,6 +178,31 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 # Define other extension targets
 #

+#
+# cumem_allocator extension
+#
+
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_CUMEM_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling cumem allocator extension.")
+  # link against cuda driver library
+  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  define_gpu_extension_target(
+    cumem_allocator
+    DESTINATION vllm
+    LANGUAGE CXX
+    SOURCES ${VLLM_CUMEM_EXT_SRC}
+    LIBRARIES ${CUMEM_LIBS}
+    USE_SABI 3.8
+    WITH_SOABI)
+endif()
+
 #
 # _C extension
 #
@@ -206,7 +228,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+  # Please keep this in sync with FetchContent_Declare line below.
+  set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -223,7 +246,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG v3.5.1
+        # Please keep this in sync with CUTLASS_REVISION line above.
+        GIT_TAG v3.7.0
         GIT_PROGRESS TRUE

         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -241,7 +265,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
+    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+    "csrc/cutlass_extensions/common.cpp")

   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -250,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@@ -270,12 +297,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       " in CUDA target architectures")
   endif()

-  #
-  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -323,6 +354,47 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

+  #
+  # 2:4 Sparse Kernels
+
+  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
+                     "if you intend on running FP8 sparse quantized models on Hopper.")
+    else()
+      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+    set(SRCS
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()
+
   #
   # Machete kernels
@@ -483,7 +555,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()

 # vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
   return()
 endif ()
@@ -506,7 +578,7 @@ endif()
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
 # If no component is specified, vllm-flash-attn is still installed.

 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
@@ -518,43 +590,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()

 if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+  FetchContent_Declare(
+        vllm-flash-attn SOURCE_DIR
+        ${VLLM_FLASH_ATTN_SRC_DIR}
+        BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
+  )
 else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
+        GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
   )
 endif()

-# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
-set(VLLM_PARENT_BUILD ON)
-
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
-
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
-
-# Copy over the vllm-flash-attn python files
+# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
+# case only one is built, in the case both are built redundant work is done)
+
 install(
   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT vllm_flash_attn_c
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa2_C
   FILES_MATCHING PATTERN "*.py"
+)
+
+install(
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa3_C
+  FILES_MATCHING PATTERN "*.py"
 )

 # Nothing after vllm-flash-attn, see comment about macros above

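As the comments in the hunk above note, setting VLLM_FLASH_ATTN_SRC_DIR makes the FetchContent step build vllm-flash-attn from a local checkout instead of cloning the pinned GIT_TAG. A rough sketch of how that might be used during a from-source build; the checkout path and the editable-install command are illustrative assumptions, not taken from this diff:

```bash
# Hypothetical local checkout path; CMake reads the variable from the environment
# (see "if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})" above), so exporting it before
# the build redirects FetchContent to this directory.
git clone https://github.com/vllm-project/flash-attention.git ~/src/flash-attention
export VLLM_FLASH_ATTN_SRC_DIR=~/src/flash-attention

# Rebuild vLLM from a source checkout so the CMake configure step picks it up.
pip install -e . --no-build-isolation
```
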
CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.

 All community leaders are obligated to respect the privacy and security of the
@@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
-

Dockerfile (108 changed lines)
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.

 # Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.rst and
-# docs/source/assets/dev/dockerfile-stages-dependency.png
+# docs/source/contributing/dockerfile/dockerfile.md and
+# docs/source/assets/contributing/dockerfile-stages-dependency.png

 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
@@ -27,6 +27,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install uv

 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@@ -45,17 +48,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace

 # install build and runtime dependencies
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-cuda.txt

+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
     fi

+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    uv pip install --system -r requirements-cuda.txt
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
@@ -75,12 +82,7 @@ ARG TARGETPLATFORM
 COPY requirements-build.txt requirements-build.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-build.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
+    uv pip install --system -r requirements-build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -127,8 +129,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \

 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# Default max size of the wheel is 250MB
-ARG VLLM_MAX_SIZE_MB=250
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=400
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -145,20 +147,19 @@ COPY requirements-lint.txt requirements-lint.txt
 COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-dev.txt
+    uv pip install --system -r requirements-dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -166,7 +167,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
@@ -176,6 +177,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install uv

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -183,23 +187,47 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
+    fi
+
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install dist/*.whl --verbose
+    uv pip install --system dist/*.whl --verbose

-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip uninstall -y torch && \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
+# If we need to build FlashInfer wheel before its release:
+# $ export FLASHINFER_ENABLE_AOT=1
+# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
+# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+# $ cd flashinfer
+# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
+# $ rm -rf build
+# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+# $ ls dist
+# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+        uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples
+
+# Although we build Flashinfer with AOT mode, there's still
+# some issues w.r.t. JIT compilation. Therefore we need to
+# install build dependencies for JIT compilation.
+# TODO: Remove this once FlashInfer AOT wheel is fixed
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    uv pip install --system -r requirements-build.txt
+
 #################### vLLM installation IMAGE ####################

 #################### TEST IMAGE ####################
@@ -211,15 +239,15 @@ ADD . /vllm-workspace/

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-dev.txt
+    uv pip install --system -r requirements-dev.txt

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -e tests/vllm_test_utils
+    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install hf_transfer
+    uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
@@ -234,17 +262,27 @@ RUN mv vllm test_docs/
 #################### TEST IMAGE ####################

 #################### OPENAI API SERVER ####################
-# openai api server alternative
-FROM vllm-base AS vllm-openai
+# base openai image with additional requirements, for any subsequent openai-style images
+FROM vllm-base AS vllm-openai-base

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
|
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||||
else \
|
else \
|
||||||
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
|
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
# define sagemaker first, so it is not default from `docker build`
|
||||||
|
FROM vllm-openai-base AS vllm-sagemaker
|
||||||
|
|
||||||
|
COPY examples/online_serving/sagemaker-entrypoint.sh .
|
||||||
|
RUN chmod +x sagemaker-entrypoint.sh
|
||||||
|
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
||||||
|
|
||||||
|
FROM vllm-openai-base AS vllm-openai
|
||||||
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
#################### OPENAI API SERVER ####################
|
#################### OPENAI API SERVER ####################
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
|
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# default base image
|
# default base image
|
||||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
|
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
|
||||||
@ -15,16 +15,19 @@ RUN apt-get update && \
|
|||||||
ffmpeg libsm6 libxext6 libgl1
|
ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
### Mount Point ###
|
### Mount Point ###
|
||||||
# When launching the container, mount the code directory to /app
|
# When launching the container, mount the code directory to /workspace
|
||||||
ARG APP_MOUNT=/app
|
ARG APP_MOUNT=/workspace
|
||||||
VOLUME [ ${APP_MOUNT} ]
|
VOLUME [ ${APP_MOUNT} ]
|
||||||
WORKDIR ${APP_MOUNT}/vllm
|
WORKDIR ${APP_MOUNT}/vllm
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN python3 -m pip install --upgrade pip
|
||||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||||
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
|
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||||
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||||
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
RUN python3 -m pip install pytest
|
||||||
|
|
||||||
|
# uninstall transformers-neuronx package explicitly to avoid version conflict
|
||||||
|
RUN python3 -m pip uninstall -y transformers-neuronx
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK=0
|
ARG GIT_REPO_CHECK=0
|
||||||
@ -42,4 +45,11 @@ RUN --mount=type=bind,source=.git,target=.git \
|
|||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
# install transformers-neuronx package as an optional dependencies (for V0)
|
||||||
|
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
|
||||||
|
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
|
||||||
|
|
||||||
|
# overwrite entrypoint to run bash script
|
||||||
|
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0
|
|||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
|
RUN python3 -m pip install -U pip
|
||||||
# install build requirements
|
# install build requirements
|
||||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
|
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
|
||||||
# build vLLM with OpenVINO backend
|
# build vLLM with OpenVINO backend
|
||||||
|
@ -4,12 +4,12 @@ USER root
|
|||||||
|
|
||||||
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
|
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
|
||||||
|
|
||||||
# Some packages in requirements-cpu are installed here
|
# Some packages in requirements-cpu are installed here
|
||||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||||
# Currently these may not be available for venv or pip directly
|
# Currently these may not be available for venv or pip directly
|
||||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
|
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
@ -18,11 +18,9 @@ ARG GIT_REPO_CHECK=0
|
|||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||||
|
|
||||||
# These packages will be in rocketce eventually
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||||
torch==2.3.1 \
|
|
||||||
-r requirements-cpu.txt \
|
-r requirements-cpu.txt \
|
||||||
xformers uvloop==0.20.0
|
xformers uvloop==0.20.0
|
||||||
|
|
||||||
|
261
Dockerfile.rocm
261
Dockerfile.rocm
@ -1,174 +1,119 @@
|
|||||||
# Default ROCm 6.2 base image
|
# default base image
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
|
ARG REMOTE_VLLM="0"
|
||||||
|
ARG USE_CYTHON="0"
|
||||||
|
ARG BUILD_RPD="1"
|
||||||
|
ARG COMMON_WORKDIR=/app
|
||||||
|
ARG BASE_IMAGE=rocm/vllm-dev:base
|
||||||
|
|
||||||
# Default ROCm ARCHes to build vLLM for.
|
FROM ${BASE_IMAGE} AS base
|
||||||
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
|
||||||
|
|
||||||
# Whether to install CK-based flash-attention
|
ARG ARG_PYTORCH_ROCM_ARCH
|
||||||
# If 0, will not install flash-attention
|
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||||
ARG BUILD_FA="1"
|
|
||||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
|
||||||
ARG FA_BRANCH="3cea2fb"
|
|
||||||
|
|
||||||
# Whether to build triton on rocm
|
|
||||||
ARG BUILD_TRITON="1"
|
|
||||||
ARG TRITON_BRANCH="e192dba"
|
|
||||||
|
|
||||||
### Base image build stage
|
|
||||||
FROM $BASE_IMAGE AS base
|
|
||||||
|
|
||||||
# Import arg(s) defined before this build stage
|
|
||||||
ARG PYTORCH_ROCM_ARCH
|
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update -q -y && apt-get install -q -y \
|
||||||
RUN apt-get update && apt-get install -y \
|
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
|
||||||
curl \
|
# Remove sccache
|
||||||
ca-certificates \
|
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
|
||||||
sudo \
|
|
||||||
git \
|
|
||||||
bzip2 \
|
|
||||||
libx11-6 \
|
|
||||||
build-essential \
|
|
||||||
wget \
|
|
||||||
unzip \
|
|
||||||
tmux \
|
|
||||||
ccache \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# When launching the container, mount the code directory to /vllm-workspace
|
|
||||||
ARG APP_MOUNT=/vllm-workspace
|
|
||||||
WORKDIR ${APP_MOUNT}
|
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
|
||||||
# Remove sccache so it doesn't interfere with ccache
|
|
||||||
# TODO: implement sccache support across components
|
|
||||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
# Install torch == 2.6.0 on ROCm
|
WORKDIR ${COMMON_WORKDIR}
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
|
||||||
*"rocm-6.2"*) \
|
|
||||||
python3 -m pip uninstall -y torch torchvision \
|
|
||||||
&& python3 -m pip install --pre \
|
|
||||||
torch==2.6.0.dev20241113+rocm6.2 \
|
|
||||||
'setuptools-scm>=8' \
|
|
||||||
torchvision==0.20.0.dev20241113+rocm6.2 \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
|
|
||||||
*) ;; esac
|
|
||||||
|
|
||||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
|
||||||
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
|
||||||
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
|
||||||
|
|
||||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
|
|
||||||
|
|
||||||
### AMD-SMI build stage
|
# -----------------------
|
||||||
FROM base AS build_amdsmi
|
# vLLM fetch stages
|
||||||
# Build amdsmi wheel always
|
FROM base AS fetch_vllm_0
|
||||||
RUN cd /opt/rocm/share/amd_smi \
|
ONBUILD COPY ./ vllm/
|
||||||
&& python3 -m pip wheel . --wheel-dir=/install
|
FROM base AS fetch_vllm_1
|
||||||
|
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||||
|
ARG VLLM_BRANCH="main"
|
||||||
|
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||||
|
&& cd vllm \
|
||||||
|
&& git checkout ${VLLM_BRANCH}
|
||||||
|
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||||
|
|
||||||
|
# -----------------------
|
||||||
|
# vLLM build stages
|
||||||
|
FROM fetch_vllm AS build_vllm
|
||||||
|
ARG USE_CYTHON
|
||||||
|
# Build vLLM
|
||||||
|
RUN cd vllm \
|
||||||
|
&& python3 -m pip install -r requirements-rocm.txt \
|
||||||
|
&& python3 setup.py clean --all \
|
||||||
|
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
FROM scratch AS export_vllm
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||||
|
|
||||||
### Flash-Attention wheel build stage
|
# -----------------------
|
||||||
FROM base AS build_fa
|
# Test vLLM image
|
||||||
ARG BUILD_FA
|
FROM base AS test
|
||||||
ARG FA_GFX_ARCHS
|
|
||||||
ARG FA_BRANCH
|
|
||||||
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
|
||||||
if [ "$BUILD_FA" = "1" ]; then \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& cd libs \
|
|
||||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
|
||||||
&& cd flash-attention \
|
|
||||||
&& git checkout "${FA_BRANCH}" \
|
|
||||||
&& git submodule update --init \
|
|
||||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
|
||||||
# Create an empty directory otherwise as later build stages expect one
|
|
||||||
else mkdir -p /install; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
### Triton wheel build stage
|
# Install vLLM
|
||||||
FROM base AS build_triton
|
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||||
ARG BUILD_TRITON
|
cd /install \
|
||||||
ARG TRITON_BRANCH
|
&& pip install -U -r requirements-rocm.txt \
|
||||||
# Build triton wheel if `BUILD_TRITON = 1`
|
&& pip uninstall -y vllm \
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
&& pip install *.whl
|
||||||
if [ "$BUILD_TRITON" = "1" ]; then \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& cd libs \
|
|
||||||
&& python3 -m pip install ninja cmake wheel pybind11 \
|
|
||||||
&& git clone https://github.com/OpenAI/triton.git \
|
|
||||||
&& cd triton \
|
|
||||||
&& git checkout "${TRITON_BRANCH}" \
|
|
||||||
&& cd python \
|
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=/install; \
|
|
||||||
# Create an empty directory otherwise as later build stages expect one
|
|
||||||
else mkdir -p /install; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
WORKDIR /vllm-workspace
|
||||||
### Final vLLM build stage
|
ARG COMMON_WORKDIR
|
||||||
FROM base AS final
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||||
# Import the vLLM development directory from the build context
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
|
||||||
|
|
||||||
# Package upgrades for useful functionality or to avoid dependency issues
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
|
||||||
|
|
||||||
|
|
||||||
# Workaround for ray >= 2.10.0
|
|
||||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
|
||||||
# Silences the HF Tokenizers warning
|
|
||||||
ENV TOKENIZERS_PARALLELISM=false
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install -Ur requirements-rocm.txt \
|
|
||||||
&& python3 setup.py clean --all \
|
|
||||||
&& python3 setup.py develop
|
|
||||||
|
|
||||||
# Copy amdsmi wheel into final image
|
|
||||||
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& cp /install/*.whl libs \
|
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
|
||||||
&& python3 -m pip uninstall -y amdsmi;
|
|
||||||
|
|
||||||
# Copy triton wheel(s) into final image if they were built
|
|
||||||
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& if ls /install/*.whl; then \
|
|
||||||
cp /install/*.whl libs \
|
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
|
||||||
&& python3 -m pip uninstall -y triton; fi
|
|
||||||
|
|
||||||
# Copy flash-attn wheel(s) into final image if they were built
|
|
||||||
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& if ls /install/*.whl; then \
|
|
||||||
cp /install/*.whl libs \
|
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
|
||||||
&& python3 -m pip uninstall -y flash-attn; fi
|
|
||||||
|
|
||||||
# Install wheels that were built to the final image
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
if ls libs/*.whl; then \
|
|
||||||
python3 -m pip install libs/*.whl; fi
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN cd /vllm-workspace \
|
||||||
|
&& rm -rf vllm \
|
||||||
|
&& python3 -m pip install -e tests/vllm_test_utils \
|
||||||
|
&& python3 -m pip install lm-eval[api]==0.4.4 \
|
||||||
|
&& python3 -m pip install pytest-shard
|
||||||
|
|
||||||
|
# -----------------------
|
||||||
|
# Final vLLM image
|
||||||
|
FROM base AS final
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||||
|
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
||||||
|
# Manually remove it so that later steps of numpy upgrade can continue
|
||||||
|
RUN case "$(which python3)" in \
|
||||||
|
*"/opt/conda/envs/py_3.9"*) \
|
||||||
|
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade huggingface-hub[cli]
|
||||||
|
ARG BUILD_RPD
|
||||||
|
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
|
||||||
|
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
|
||||||
|
&& cd rocmProfileData/rpd_tracer \
|
||||||
|
&& pip install -r requirements.txt && cd ../ \
|
||||||
|
&& make && make install \
|
||||||
|
&& cd hipMarker && python3 setup.py install ; fi
|
||||||
|
|
||||||
|
# Install vLLM
|
||||||
|
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||||
|
cd /install \
|
||||||
|
&& pip install -U -r requirements-rocm.txt \
|
||||||
|
&& pip uninstall -y vllm \
|
||||||
|
&& pip install *.whl
|
||||||
|
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
|
|
||||||
|
# Copy over the benchmark scripts as well
|
||||||
|
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
|
||||||
|
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
|
||||||
|
|
||||||
|
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
ENV TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
|
# Performance environment variable.
|
||||||
|
ENV HIP_FORCE_DEV_KERNARG=1
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|
||||||
|
158
Dockerfile.rocm_base
Normal file
158
Dockerfile.rocm_base
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
|
||||||
|
ARG HIPBLASLT_BRANCH="4d40e36"
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION=
|
||||||
|
ARG RCCL_BRANCH="648a58d"
|
||||||
|
ARG RCCL_REPO="https://github.com/ROCm/rccl"
|
||||||
|
ARG TRITON_BRANCH="e5be006"
|
||||||
|
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||||
|
ARG PYTORCH_BRANCH="3a585126"
|
||||||
|
ARG PYTORCH_VISION_BRANCH="v0.19.1"
|
||||||
|
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||||
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||||
|
ARG FA_BRANCH="b7d29fb"
|
||||||
|
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||||
|
|
||||||
|
FROM ${BASE_IMAGE} AS base
|
||||||
|
|
||||||
|
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||||
|
ENV ROCM_PATH=/opt/rocm
|
||||||
|
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||||
|
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
|
||||||
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
|
RUN mkdir -p /app
|
||||||
|
WORKDIR /app
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# Install Python and other dependencies
|
||||||
|
RUN apt-get update -y \
|
||||||
|
&& apt-get install -y software-properties-common git curl sudo vim less \
|
||||||
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
|
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
|
||||||
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
|
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||||
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
|
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||||
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
|
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
|
||||||
|
|
||||||
|
FROM base AS build_hipblaslt
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH
|
||||||
|
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
||||||
|
RUN cd hipBLAS-common \
|
||||||
|
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build \
|
||||||
|
&& cmake .. \
|
||||||
|
&& make package \
|
||||||
|
&& dpkg -i ./*.deb
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||||
|
RUN cd hipBLASLt \
|
||||||
|
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||||
|
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||||
|
&& cd build/release \
|
||||||
|
&& make package
|
||||||
|
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||||
|
|
||||||
|
FROM base AS build_rccl
|
||||||
|
ARG RCCL_BRANCH
|
||||||
|
ARG RCCL_REPO
|
||||||
|
RUN git clone ${RCCL_REPO}
|
||||||
|
RUN cd rccl \
|
||||||
|
&& git checkout ${RCCL_BRANCH} \
|
||||||
|
&& ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
|
||||||
|
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
|
||||||
|
|
||||||
|
FROM base AS build_triton
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
ARG TRITON_REPO
|
||||||
|
RUN git clone ${TRITON_REPO}
|
||||||
|
RUN cd triton \
|
||||||
|
&& git checkout ${TRITON_BRANCH} \
|
||||||
|
&& cd python \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS build_amdsmi
|
||||||
|
RUN cd /opt/rocm/share/amd_smi \
|
||||||
|
&& pip wheel . --wheel-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS build_pytorch
|
||||||
|
ARG PYTORCH_BRANCH
|
||||||
|
ARG PYTORCH_VISION_BRANCH
|
||||||
|
ARG PYTORCH_REPO
|
||||||
|
ARG PYTORCH_VISION_REPO
|
||||||
|
ARG FA_BRANCH
|
||||||
|
ARG FA_REPO
|
||||||
|
RUN git clone ${PYTORCH_REPO} pytorch
|
||||||
|
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
||||||
|
pip install -r requirements.txt && git submodule update --init --recursive \
|
||||||
|
&& python3 tools/amd_build/build_amd.py \
|
||||||
|
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
|
&& pip install dist/*.whl
|
||||||
|
RUN git clone ${PYTORCH_VISION_REPO} vision
|
||||||
|
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
|
&& pip install dist/*.whl
|
||||||
|
RUN git clone ${FA_REPO}
|
||||||
|
RUN cd flash-attention \
|
||||||
|
&& git checkout ${FA_BRANCH} \
|
||||||
|
&& git submodule update --init \
|
||||||
|
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||||
|
&& cp /app/vision/dist/*.whl /app/install \
|
||||||
|
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS final
|
||||||
|
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||||
|
dpkg -i /install/*deb \
|
||||||
|
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
|
||||||
|
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
|
||||||
|
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
|
||||||
|
dpkg -i /install/*deb \
|
||||||
|
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
|
||||||
|
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
|
||||||
|
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
|
ARG RCCL_BRANCH
|
||||||
|
ARG RCCL_REPO
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
ARG TRITON_REPO
|
||||||
|
ARG PYTORCH_BRANCH
|
||||||
|
ARG PYTORCH_VISION_BRANCH
|
||||||
|
ARG PYTORCH_REPO
|
||||||
|
ARG PYTORCH_VISION_REPO
|
||||||
|
ARG FA_BRANCH
|
||||||
|
ARG FA_REPO
|
||||||
|
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||||
|
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
|
||||||
|
&& echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
|
@ -1,4 +1,4 @@
|
|||||||
ARG NIGHTLY_DATE="20241017"
|
ARG NIGHTLY_DATE="20250124"
|
||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
54
README.md
54
README.md
@ -10,12 +10,19 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||||
@ -31,13 +38,17 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## About
|
## About
|
||||||
|
|
||||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||||
|
|
||||||
|
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
|
||||||
|
|
||||||
vLLM is fast with:
|
vLLM is fast with:
|
||||||
|
|
||||||
- State-of-the-art serving throughput
|
- State-of-the-art serving throughput
|
||||||
- Efficient management of attention key and value memory with **PagedAttention**
|
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||||
- Continuous batching of incoming requests
|
- Continuous batching of incoming requests
|
||||||
- Fast model execution with CUDA/HIP graph
|
- Fast model execution with CUDA/HIP graph
|
||||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||||
@ -60,7 +71,7 @@ vLLM is flexible and easy to use with:
|
|||||||
|
|
||||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||||
- Transformer-like LLMs (e.g., Llama)
|
- Transformer-like LLMs (e.g., Llama)
|
||||||
- Mixture-of-Expert LLMs (e.g., Mixtral)
|
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
||||||
- Embedding Models (e.g. E5-Mistral)
|
- Embedding Models (e.g. E5-Mistral)
|
||||||
- Multi-modal LLMs (e.g., LLaVA)
|
- Multi-modal LLMs (e.g., LLaVA)
|
||||||
|
|
||||||
@ -68,16 +79,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod
|
|||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm
|
pip install vllm
|
||||||
```
|
```
|
||||||
|
|
||||||
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
|
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||||
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
|
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
|
||||||
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
|
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
|
||||||
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
@ -90,34 +101,40 @@ vLLM is a community project. Our compute resources for development and testing a
|
|||||||
|
|
||||||
<!-- Note: Please sort them in alphabetical order. -->
|
<!-- Note: Please sort them in alphabetical order. -->
|
||||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
||||||
|
Cash Donations:
|
||||||
- a16z
|
- a16z
|
||||||
|
- Dropbox
|
||||||
|
- Sequoia Capital
|
||||||
|
- Skywork AI
|
||||||
|
- ZhenFund
|
||||||
|
|
||||||
|
Compute Resources:
|
||||||
- AMD
|
- AMD
|
||||||
- Anyscale
|
- Anyscale
|
||||||
- AWS
|
- AWS
|
||||||
- Crusoe Cloud
|
- Crusoe Cloud
|
||||||
- Databricks
|
- Databricks
|
||||||
- DeepInfra
|
- DeepInfra
|
||||||
- Dropbox
|
|
||||||
- Google Cloud
|
- Google Cloud
|
||||||
- Lambda Lab
|
- Lambda Lab
|
||||||
- Nebius
|
- Nebius
|
||||||
|
- Novita AI
|
||||||
- NVIDIA
|
- NVIDIA
|
||||||
- Replicate
|
- Replicate
|
||||||
- Roblox
|
- Roblox
|
||||||
- RunPod
|
- RunPod
|
||||||
- Sequoia Capital
|
|
||||||
- Skywork AI
|
|
||||||
- Trainy
|
- Trainy
|
||||||
- UC Berkeley
|
- UC Berkeley
|
||||||
- UC San Diego
|
- UC San Diego
|
||||||
- ZhenFund
|
|
||||||
|
Slack Sponsor: Anyscale
|
||||||
|
|
||||||
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
||||||
|
|
||||||
```bibtex
|
```bibtex
|
||||||
@inproceedings{kwon2023efficient,
|
@inproceedings{kwon2023efficient,
|
||||||
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
|
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
|
||||||
@ -129,12 +146,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
|
|
||||||
## Contact Us
|
## Contact Us
|
||||||
|
|
||||||
* For technical questions and feature requests, please use Github issues or discussions.
|
- For technical questions and feature requests, please use Github issues or discussions.
|
||||||
* For discussing with fellow users, please use Discord.
|
- For discussing with fellow users and coordinating contributions and development, please use Slack.
|
||||||
* For coordinating contributions and development, please use Slack.
|
- For security disclosures, please use Github's security advisory feature.
|
||||||
* For security disclosures, please use Github's security advisory feature.
|
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
|
||||||
|
|
||||||
## Media Kit
|
## Media Kit
|
||||||
|
|
||||||
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
||||||
|
|
||||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
|
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
## Downloading the ShareGPT dataset
|
## Downloading the ShareGPT dataset
|
||||||
|
|
||||||
You can download the dataset by running:
|
You can download the dataset by running:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
```
|
```
|
||||||
@ -11,9 +12,18 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|||||||
|
|
||||||
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
|
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
|
||||||
will ignore a datapoint if the referred image is missing.
|
will ignore a datapoint if the referred image is missing.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
|
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
|
||||||
mkdir coco -p
|
mkdir coco -p
|
||||||
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
|
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
|
||||||
unzip coco/train2017.zip -d coco/
|
unzip coco/train2017.zip -d coco/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# Downloading the BurstGPT dataset
|
||||||
|
|
||||||
|
You can download the BurstGPT v1.1 dataset by running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
|
||||||
|
```
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@ -22,6 +24,7 @@ class RequestFuncInput:
|
|||||||
prompt_len: int
|
prompt_len: int
|
||||||
output_len: int
|
output_len: int
|
||||||
model: str
|
model: str
|
||||||
|
model_name: Optional[str] = None
|
||||||
best_of: int = 1
|
best_of: int = 1
|
||||||
logprobs: Optional[int] = None
|
logprobs: Optional[int] = None
|
||||||
extra_body: Optional[dict] = None
|
extra_body: Optional[dict] = None
|
||||||
@ -34,6 +37,7 @@ class RequestFuncOutput:
|
|||||||
generated_text: str = ""
|
generated_text: str = ""
|
||||||
success: bool = False
|
success: bool = False
|
||||||
latency: float = 0.0
|
latency: float = 0.0
|
||||||
|
output_tokens: int = 0
|
||||||
ttft: float = 0.0 # Time to first token
|
ttft: float = 0.0 # Time to first token
|
||||||
itl: List[float] = field(
|
itl: List[float] = field(
|
||||||
default_factory=list) # List of inter-token latencies
|
default_factory=list) # List of inter-token latencies
|
||||||
@ -49,7 +53,8 @@ async def async_request_tgi(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
params = {
|
params = {
|
||||||
"best_of": request_func_input.best_of,
|
"best_of": request_func_input.best_of,
|
||||||
"max_new_tokens": request_func_input.output_len,
|
"max_new_tokens": request_func_input.output_len,
|
||||||
@ -78,7 +83,7 @@ async def async_request_tgi(
|
|||||||
continue
|
continue
|
||||||
chunk_bytes = chunk_bytes.decode("utf-8")
|
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||||
|
|
||||||
#NOTE: Sometimes TGI returns a ping response without
|
# NOTE: Sometimes TGI returns a ping response without
|
||||||
# any data, we should skip it.
|
# any data, we should skip it.
|
||||||
if chunk_bytes.startswith(":"):
|
if chunk_bytes.startswith(":"):
|
||||||
continue
|
continue
|
||||||
@ -121,7 +126,8 @@ async def async_request_trt_llm(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert request_func_input.best_of == 1
|
assert request_func_input.best_of == 1
|
||||||
payload = {
|
payload = {
|
||||||
"accumulate_tokens": True,
|
"accumulate_tokens": True,
|
||||||
@ -155,7 +161,7 @@ async def async_request_trt_llm(
|
|||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if ttft == 0.0:
|
||||||
ttft = time.perf_counter() - st
|
ttft = timestamp - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
@ -185,7 +191,8 @@ async def async_request_deepspeed_mii(
|
|||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert request_func_input.best_of == 1
|
assert request_func_input.best_of == 1
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
@ -233,17 +240,23 @@ async def async_request_openai_completions(
|
|||||||
("completions", "profile")
|
("completions", "profile")
|
||||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model,
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"best_of": request_func_input.best_of,
|
"best_of": request_func_input.best_of,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"logprobs": request_func_input.logprobs,
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"ignore_eos": request_func_input.ignore_eos,
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {
|
||||||
@ -254,7 +267,6 @@ async def async_request_openai_completions(
|
|||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
generated_text = ""
|
generated_text = ""
|
||||||
ttft = 0.0
|
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
@ -269,15 +281,16 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk != "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
|
||||||
else:
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
# NOTE: Some completion API might have a last
|
# NOTE: Some completion API might have a last
|
||||||
# usage summary response without a token so we
|
# usage summary response without a token so we
|
||||||
# want to check a token was generated
|
# want to check a token was generated
|
||||||
if data["choices"][0]["text"]:
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
if not first_chunk_received:
|
if not first_chunk_received:
|
||||||
@ -291,7 +304,10 @@ async def async_request_openai_completions(
|
|||||||
most_recent_timestamp)
|
most_recent_timestamp)
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += data["choices"][0]["text"]
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
if first_chunk_received:
|
if first_chunk_received:
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
@ -300,7 +316,7 @@ async def async_request_openai_completions(
|
|||||||
"Never received a valid chunk to calculate TTFT."
|
"Never received a valid chunk to calculate TTFT."
|
||||||
"This response will be marked as failed!")
|
"This response will be marked as failed!")
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.latency = latency
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
output.success = False
|
output.success = False
|
||||||
@ -323,12 +339,14 @@ async def async_request_openai_chat_completions(
|
|||||||
"chat/completions"
|
"chat/completions"
|
||||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
if request_func_input.multi_modal_content:
|
if request_func_input.multi_modal_content:
|
||||||
content.append(request_func_input.multi_modal_content)
|
content.append(request_func_input.multi_modal_content)
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model,
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -338,8 +356,12 @@ async def async_request_openai_chat_completions(
|
|||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_completion_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"ignore_eos": request_func_input.ignore_eos,
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {
|
||||||
@ -365,17 +387,15 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk != "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
|
||||||
else:
|
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
delta = data["choices"][0]["delta"]
|
if choices := data.get("choices"):
|
||||||
if delta.get("content", None):
|
content = choices[0]["delta"].get("content")
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if ttft == 0.0:
|
||||||
ttft = time.perf_counter() - st
|
ttft = timestamp - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
@ -383,13 +403,16 @@ async def async_request_openai_chat_completions(
|
|||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp -
|
||||||
most_recent_timestamp)
|
most_recent_timestamp)
|
||||||
|
|
||||||
generated_text += delta["content"]
|
generated_text += content or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.success = True
|
output.success = True
|
||||||
output.latency = latency
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
output.success = False
|
output.success = False
|
||||||
@ -417,14 +440,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def get_tokenizer(
|
def get_tokenizer(
|
||||||
pretrained_model_name_or_path: str, trust_remote_code: bool
|
pretrained_model_name_or_path: str,
|
||||||
|
tokenizer_mode: str = "auto",
|
||||||
|
trust_remote_code: bool = False,
|
||||||
|
**kwargs,
|
||||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path):
|
pretrained_model_name_or_path):
|
||||||
pretrained_model_name_or_path = get_model(
|
pretrained_model_name_or_path = get_model(
|
||||||
pretrained_model_name_or_path)
|
pretrained_model_name_or_path)
|
||||||
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
|
if tokenizer_mode == "slow":
|
||||||
trust_remote_code=trust_remote_code)
|
if kwargs.get("use_fast", False):
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||||
|
kwargs["use_fast"] = False
|
||||||
|
if tokenizer_mode == "mistral":
|
||||||
|
try:
|
||||||
|
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||||
|
except ImportError as e:
|
||||||
|
raise ImportError("MistralTokenizer requires vllm package.\n"
|
||||||
|
"Please install it with `pip install vllm` "
|
||||||
|
"to use mistral tokenizer mode.") from e
|
||||||
|
return MistralTokenizer.from_pretrained(
|
||||||
|
str(pretrained_model_name_or_path))
|
||||||
|
else:
|
||||||
|
return AutoTokenizer.from_pretrained(
|
||||||
|
pretrained_model_name_or_path,
|
||||||
|
trust_remote_code=trust_remote_code,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
ASYNC_REQUEST_FUNCS = {
|
ASYNC_REQUEST_FUNCS = {
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""Benchmark guided decoding throughput."""
|
"""Benchmark guided decoding throughput."""
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
@ -1,21 +1,39 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""Benchmark the latency of processing a single batch of requests."""
|
"""Benchmark the latency of processing a single batch of requests."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptType
|
from vllm.inputs import PromptType
|
||||||
|
from vllm.sampling_params import BeamSearchParams
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||||
|
results: Dict[str, Any]) -> None:
|
||||||
|
pt_records = convert_to_pytorch_benchmark_format(
|
||||||
|
args=args,
|
||||||
|
metrics={"latency": results["latencies"]},
|
||||||
|
extra_info={k: results[k]
|
||||||
|
for k in ["avg_latency", "percentiles"]})
|
||||||
|
if pt_records:
|
||||||
|
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||||
|
with open(pt_file, "w") as f:
|
||||||
|
json.dump(pt_records, f)
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
@ -40,6 +58,21 @@ def main(args: argparse.Namespace):
|
|||||||
"prompt_token_ids": batch
|
"prompt_token_ids": batch
|
||||||
} for batch in dummy_prompt_token_ids.tolist()]
|
} for batch in dummy_prompt_token_ids.tolist()]
|
||||||
|
|
||||||
|
def llm_generate():
|
||||||
|
if not args.use_beam_search:
|
||||||
|
llm.generate(dummy_prompts,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
use_tqdm=False)
|
||||||
|
else:
|
||||||
|
llm.beam_search(
|
||||||
|
dummy_prompts,
|
||||||
|
BeamSearchParams(
|
||||||
|
beam_width=args.n,
|
||||||
|
max_tokens=args.output_len,
|
||||||
|
ignore_eos=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
def run_to_completion(profile_dir: Optional[str] = None):
|
def run_to_completion(profile_dir: Optional[str] = None):
|
||||||
if profile_dir:
|
if profile_dir:
|
||||||
with torch.profiler.profile(
|
with torch.profiler.profile(
|
||||||
@ -48,16 +81,13 @@ def main(args: argparse.Namespace):
|
|||||||
torch.profiler.ProfilerActivity.CUDA,
|
torch.profiler.ProfilerActivity.CUDA,
|
||||||
],
|
],
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
str(profile_dir))) as p:
|
str(profile_dir)),
|
||||||
llm.generate(dummy_prompts,
|
) as p:
|
||||||
sampling_params=sampling_params,
|
llm_generate()
|
||||||
use_tqdm=False)
|
print(p.key_averages().table(sort_by="self_cuda_time_total"))
|
||||||
print(p.key_averages())
|
|
||||||
else:
|
else:
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
llm.generate(dummy_prompts,
|
llm_generate()
|
||||||
sampling_params=sampling_params,
|
|
||||||
use_tqdm=False)
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
latency = end_time - start_time
|
latency = end_time - start_time
|
||||||
return latency
|
return latency
|
||||||
@ -69,9 +99,8 @@ def main(args: argparse.Namespace):
|
|||||||
if args.profile:
|
if args.profile:
|
||||||
profile_dir = args.profile_result_dir
|
profile_dir = args.profile_result_dir
|
||||||
if not profile_dir:
|
if not profile_dir:
|
||||||
profile_dir = Path(
|
profile_dir = (Path(".") / "vllm_benchmark_result" /
|
||||||
"."
|
f"latency_result_{time.time()}")
|
||||||
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
|
|
||||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||||
run_to_completion(profile_dir=profile_dir)
|
run_to_completion(profile_dir=profile_dir)
|
||||||
return
|
return
|
||||||
@ -83,9 +112,9 @@ def main(args: argparse.Namespace):
|
|||||||
latencies = np.array(latencies)
|
latencies = np.array(latencies)
|
||||||
percentages = [10, 25, 50, 75, 90, 99]
|
percentages = [10, 25, 50, 75, 90, 99]
|
||||||
percentiles = np.percentile(latencies, percentages)
|
percentiles = np.percentile(latencies, percentages)
|
||||||
print(f'Avg latency: {np.mean(latencies)} seconds')
|
print(f"Avg latency: {np.mean(latencies)} seconds")
|
||||||
for percentage, percentile in zip(percentages, percentiles):
|
for percentage, percentile in zip(percentages, percentiles):
|
||||||
print(f'{percentage}% percentile latency: {percentile} seconds')
|
print(f"{percentage}% percentile latency: {percentile} seconds")
|
||||||
|
|
||||||
# Output JSON results if specified
|
# Output JSON results if specified
|
||||||
if args.output_json:
|
if args.output_json:
|
||||||
@ -96,43 +125,51 @@ def main(args: argparse.Namespace):
|
|||||||
}
|
}
|
||||||
with open(args.output_json, "w") as f:
|
with open(args.output_json, "w") as f:
|
||||||
json.dump(results, f, indent=4)
|
json.dump(results, f, indent=4)
|
||||||
|
save_to_pytorch_benchmark_format(args, results)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the latency of processing a single batch of '
|
description="Benchmark the latency of processing a single batch of "
|
||||||
'requests till completion.')
|
"requests till completion.")
|
||||||
parser.add_argument('--input-len', type=int, default=32)
|
parser.add_argument("--input-len", type=int, default=32)
|
||||||
parser.add_argument('--output-len', type=int, default=128)
|
parser.add_argument("--output-len", type=int, default=128)
|
||||||
parser.add_argument('--batch-size', type=int, default=8)
|
parser.add_argument("--batch-size", type=int, default=8)
|
||||||
parser.add_argument('--n',
|
parser.add_argument(
|
||||||
type=int,
|
"--n",
|
||||||
default=1,
|
type=int,
|
||||||
help='Number of generated sequences per prompt.')
|
default=1,
|
||||||
parser.add_argument('--use-beam-search', action='store_true')
|
help="Number of generated sequences per prompt.",
|
||||||
parser.add_argument('--num-iters-warmup',
|
)
|
||||||
type=int,
|
parser.add_argument("--use-beam-search", action="store_true")
|
||||||
default=10,
|
parser.add_argument(
|
||||||
help='Number of iterations to run for warmup.')
|
"--num-iters-warmup",
|
||||||
parser.add_argument('--num-iters',
|
type=int,
|
||||||
|
default=10,
|
||||||
|
help="Number of iterations to run for warmup.",
|
||||||
|
)
|
||||||
|
parser.add_argument("--num-iters",
|
||||||
type=int,
|
type=int,
|
||||||
default=30,
|
default=30,
|
||||||
help='Number of iterations to run.')
|
help="Number of iterations to run.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--profile',
|
"--profile",
|
||||||
action='store_true',
|
action="store_true",
|
||||||
help='profile the generation process of a single batch')
|
help="profile the generation process of a single batch",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--profile-result-dir',
|
"--profile-result-dir",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help=('path to save the pytorch profiler output. Can be visualized '
|
help=("path to save the pytorch profiler output. Can be visualized "
|
||||||
'with ui.perfetto.dev or Tensorboard.'))
|
"with ui.perfetto.dev or Tensorboard."),
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
"--output-json",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the latency results in JSON format.')
|
help="Path to save the latency results in JSON format.",
|
||||||
|
)
|
||||||
|
|
||||||
parser = EngineArgs.add_cli_args(parser)
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
184
benchmarks/benchmark_long_document_qa_throughput.py
Normal file
184
benchmarks/benchmark_long_document_qa_throughput.py
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
"""
|
||||||
|
Offline benchmark to test the long document QA throughput.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
# This workload samples 8 different prompts with a default input
|
||||||
|
# length of 20000 tokens, then replicates each prompt 2 times
|
||||||
|
# in random order.
|
||||||
|
python benchmark_long_document_qa_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--num-documents 8 \
|
||||||
|
--repeat-count 2
|
||||||
|
|
||||||
|
Commandline arguments:
|
||||||
|
--num-documents: The number of documents to sample prompts from.
|
||||||
|
|
||||||
|
--document-length: The length of each document in tokens.
|
||||||
|
(Optional, default: 20000)
|
||||||
|
|
||||||
|
--output-len: The number of tokens to generate for each prompt.
|
||||||
|
(Optional, default: 10)
|
||||||
|
|
||||||
|
--repeat-count: The number of times to repeat each prompt.
|
||||||
|
(Optional, default: 2)
|
||||||
|
|
||||||
|
--repeat-mode: The mode to repeat prompts. The supported modes are:
|
||||||
|
- 'random': shuffle the prompts randomly. (Default)
|
||||||
|
- 'tile': the entire prompt list is repeated in sequence. (Potentially
|
||||||
|
lowest cache hit)
|
||||||
|
- 'interleave': each prompt is repeated consecutively before
|
||||||
|
moving to the next element. (Highest cache hit)
|
||||||
|
|
||||||
|
--shuffle-seed: Random seed when the repeat mode is "random".
|
||||||
|
(Optional, default: 0)
|
||||||
|
|
||||||
|
In the meantime, it also supports all the vLLM engine args to initialize the
|
||||||
|
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
|
||||||
|
details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import dataclasses
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
|
||||||
|
"""
|
||||||
|
Test long document QA with the given prompts and sampling parameters.
|
||||||
|
Print the time spent in processing all the prompts.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
llm: The language model used for generating responses.
|
||||||
|
sampling_params: Sampling parameter used to generate the response.
|
||||||
|
prompts: A list of prompt strings to be processed by the LLM.
|
||||||
|
"""
|
||||||
|
start_time = time.time()
|
||||||
|
llm.generate(prompts, sampling_params=sampling_params)
|
||||||
|
end_time = time.time()
|
||||||
|
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
|
||||||
|
|
||||||
|
|
||||||
|
def repeat_prompts(prompts, repeat_count, mode: str):
|
||||||
|
"""
|
||||||
|
Repeat each prompt in the list for a specified number of times.
|
||||||
|
The order of prompts in the output list depends on the mode.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prompts: A list of prompts to be repeated.
|
||||||
|
repeat_count: The number of times each prompt is repeated.
|
||||||
|
mode: The mode of repetition. Supported modes are:
|
||||||
|
- 'random': Shuffle the prompts randomly after repetition.
|
||||||
|
- 'tile': Repeat the entire prompt list in sequence.
|
||||||
|
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
|
||||||
|
- 'interleave': Repeat each prompt consecutively before moving to
|
||||||
|
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of repeated prompts in the specified order.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If an invalid mode is provided.
|
||||||
|
"""
|
||||||
|
print("Repeat mode: ", mode)
|
||||||
|
if mode == 'random':
|
||||||
|
repeated_prompts = prompts * repeat_count
|
||||||
|
random.shuffle(repeated_prompts)
|
||||||
|
return repeated_prompts
|
||||||
|
elif mode == 'tile':
|
||||||
|
return prompts * repeat_count
|
||||||
|
elif mode == 'interleave':
|
||||||
|
repeated_prompts = []
|
||||||
|
for prompt in prompts:
|
||||||
|
repeated_prompts.extend([prompt] * repeat_count)
|
||||||
|
return repeated_prompts
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid mode: {mode}, only support "
|
||||||
|
"'random', 'tile', 'interleave'")
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
random.seed(args.shuffle_seed)
|
||||||
|
|
||||||
|
# Prepare the prompts:
|
||||||
|
# we append the document id at the beginning to avoid any of the document
|
||||||
|
# being the prefix of other documents
|
||||||
|
prompts = [
|
||||||
|
str(i) + ' '.join(['hi'] * args.document_length)
|
||||||
|
for i in range(args.num_documents)
|
||||||
|
]
|
||||||
|
|
||||||
|
prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
|
||||||
|
|
||||||
|
warmup_prompts = [
|
||||||
|
"This is warm up request " + str(i) + \
|
||||||
|
' '.join(['hi'] * args.document_length)
|
||||||
|
for i in range(args.num_documents)]
|
||||||
|
|
||||||
|
# Create the LLM engine
|
||||||
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
|
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||||
|
|
||||||
|
print("------warm up------")
|
||||||
|
test_long_document_qa(
|
||||||
|
llm=llm,
|
||||||
|
prompts=warmup_prompts,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("------start generating------")
|
||||||
|
test_long_document_qa(
|
||||||
|
llm=llm,
|
||||||
|
prompts=prompts,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = FlexibleArgumentParser(
|
||||||
|
description=
|
||||||
|
'Benchmark the performance with or without automatic prefix caching.')
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--document-length',
|
||||||
|
type=int,
|
||||||
|
# Roughly the number of tokens for a system paper,
|
||||||
|
# excluding images
|
||||||
|
default=20000,
|
||||||
|
help='Range of input lengths for sampling prompts,'
|
||||||
|
'specified as "min:max" (e.g., "128:256").')
|
||||||
|
|
||||||
|
parser.add_argument('--num-documents',
|
||||||
|
type=int,
|
||||||
|
default=8,
|
||||||
|
help='Range of input lengths for sampling prompts,'
|
||||||
|
'specified as "min:max" (e.g., "128:256").')
|
||||||
|
|
||||||
|
parser.add_argument('--output-len', type=int, default=10)
|
||||||
|
|
||||||
|
parser.add_argument('--repeat-count',
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help='Number of times to repeat each prompt')
|
||||||
|
|
||||||
|
parser.add_argument("--repeat-mode",
|
||||||
|
type=str,
|
||||||
|
default='random',
|
||||||
|
help='The mode to repeat prompts. The supported '
|
||||||
|
'modes are "random", "tile", and "interleave". '
|
||||||
|
'See repeat_prompts() in the source code for details.')
|
||||||
|
|
||||||
|
parser.add_argument("--shuffle-seed",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help='Random seed when the repeat mode is "random"')
|
||||||
|
|
||||||
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
@ -1,3 +1,4 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""
|
"""
|
||||||
Benchmark the efficiency of prefix caching.
|
Benchmark the efficiency of prefix caching.
|
||||||
|
|
||||||
@ -10,7 +11,8 @@ Fixed example usage:
|
|||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
--model meta-llama/Llama-2-7b-chat-hf \
|
||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--num-prompts 1 \
|
--num-prompts 1 \
|
||||||
--repeat-count 100
|
--repeat-count 100 \
|
||||||
|
--input-length-range 128:256
|
||||||
|
|
||||||
ShareGPT example usage:
|
ShareGPT example usage:
|
||||||
# This command samples 20 prompts with input lengths
|
# This command samples 20 prompts with input lengths
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""Benchmark offline prioritization."""
|
"""Benchmark offline prioritization."""
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
r"""Benchmark online serving throughput.
|
r"""Benchmark online serving throughput.
|
||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
@ -25,6 +26,7 @@ On the client side, run:
|
|||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
|
import gc
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@ -36,6 +38,7 @@ from datetime import datetime
|
|||||||
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
|
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||||
RequestFuncOutput)
|
RequestFuncOutput)
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
@ -53,6 +56,8 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
|
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||||
|
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||||
|
|
||||||
|
|
||||||
@ -129,6 +134,35 @@ def sample_sharegpt_requests(
|
|||||||
return filtered_dataset
|
return filtered_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def sample_burstgpt_requests(
|
||||||
|
dataset_path: str,
|
||||||
|
num_requests: int,
|
||||||
|
random_seed: int,
|
||||||
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
) -> List[Tuple[str, int, int, None]]:
|
||||||
|
df = pd.read_csv(dataset_path)
|
||||||
|
gpt4_df = df[df["Model"] == "GPT-4"]
|
||||||
|
# Remove the failed requests (i.e., response length is 0)
|
||||||
|
gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
|
||||||
|
# Randomly sample num_requests from the dataset
|
||||||
|
if num_requests <= len(gpt4_df):
|
||||||
|
gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
|
||||||
|
else:
|
||||||
|
gpt4_df = gpt4_df.sample(n=num_requests,
|
||||||
|
random_state=random_seed,
|
||||||
|
replace=True)
|
||||||
|
# Convert the dataframe to a list of tuples
|
||||||
|
dataset = gpt4_df.values.tolist()
|
||||||
|
input_requests = []
|
||||||
|
for i in range(num_requests):
|
||||||
|
input_len = int(dataset[i][2])
|
||||||
|
output_len = int(dataset[i][3])
|
||||||
|
prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
|
||||||
|
for j in range(input_len)])
|
||||||
|
input_requests.append((prompt, input_len, output_len, None))
|
||||||
|
return input_requests
|
||||||
|
|
||||||
|
|
||||||
def sample_sonnet_requests(
|
def sample_sonnet_requests(
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
@ -199,7 +233,7 @@ def sample_sonnet_requests(
|
|||||||
return sampled_requests
|
return sampled_requests
|
||||||
|
|
||||||
|
|
||||||
def sample_mmmu_pro_vision_requests(
|
def sample_vision_arena_requests(
|
||||||
dataset,
|
dataset,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
@ -211,13 +245,7 @@ def sample_mmmu_pro_vision_requests(
|
|||||||
if len(sampled_requests) == num_requests:
|
if len(sampled_requests) == num_requests:
|
||||||
break
|
break
|
||||||
|
|
||||||
# MMMU-Pro vision direct prompt
|
prompt = data["turns"][0][0]['content']
|
||||||
# Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
|
|
||||||
prompt = (
|
|
||||||
"Answer with the option letter from the given choices directly. "
|
|
||||||
"The last line of your response should be of the following "
|
|
||||||
"format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
|
|
||||||
"options.")
|
|
||||||
|
|
||||||
prompt_token_ids = tokenizer(prompt).input_ids
|
prompt_token_ids = tokenizer(prompt).input_ids
|
||||||
if fixed_output_len is None:
|
if fixed_output_len is None:
|
||||||
@ -229,10 +257,10 @@ def sample_mmmu_pro_vision_requests(
|
|||||||
output_len = fixed_output_len
|
output_len = fixed_output_len
|
||||||
|
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
data["image"],
|
data["images"][0],
|
||||||
Image), ("Input image format must be `PIL.Image.Image`, "
|
Image), ("Input image format must be `PIL.Image.Image`, "
|
||||||
f"given {type(data['image'])}.")
|
f"given {type(data['image'])}.")
|
||||||
image: Image = data["image"]
|
image: Image = data["images"][0]
|
||||||
image = image.convert("RGB")
|
image = image.convert("RGB")
|
||||||
image_data = io.BytesIO()
|
image_data = io.BytesIO()
|
||||||
image.save(image_data, format='JPEG')
|
image.save(image_data, format='JPEG')
|
||||||
@ -251,7 +279,7 @@ def sample_mmmu_pro_vision_requests(
|
|||||||
|
|
||||||
def sample_hf_requests(
|
def sample_hf_requests(
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
dataset_subset: str,
|
dataset_subset: Optional[str],
|
||||||
dataset_split: str,
|
dataset_split: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
@ -259,19 +287,17 @@ def sample_hf_requests(
|
|||||||
fixed_output_len: Optional[int] = None,
|
fixed_output_len: Optional[int] = None,
|
||||||
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
||||||
|
|
||||||
# Special case for MMMU-Pro vision dataset
|
# Special case for vision_arena dataset
|
||||||
if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
|
if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
|
||||||
assert dataset_split == "test"
|
and dataset_subset is None:
|
||||||
|
assert dataset_split == "train"
|
||||||
dataset = load_dataset(dataset_path,
|
dataset = load_dataset(dataset_path,
|
||||||
name=dataset_subset,
|
name=dataset_subset,
|
||||||
split=dataset_split,
|
split=dataset_split,
|
||||||
streaming=True)
|
streaming=True)
|
||||||
assert "image" in dataset.features, (
|
dataset = dataset.shuffle(seed=random_seed)
|
||||||
"MMMU/MMMU_Pro vision dataset must have 'image' column.")
|
return sample_vision_arena_requests(dataset, num_requests, tokenizer,
|
||||||
filter_func = lambda x: isinstance(x["image"], Image)
|
fixed_output_len)
|
||||||
dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
|
|
||||||
return sample_mmmu_pro_vision_requests(dataset, num_requests,
|
|
||||||
tokenizer, fixed_output_len)
|
|
||||||
|
|
||||||
dataset = load_dataset(dataset_path,
|
dataset = load_dataset(dataset_path,
|
||||||
name=dataset_subset,
|
name=dataset_subset,
|
||||||
@ -378,21 +404,21 @@ async def get_request(
|
|||||||
burstiness: float = 1.0,
|
burstiness: float = 1.0,
|
||||||
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
||||||
"""
|
"""
|
||||||
Asynchronously generates requests at a specified rate
|
Asynchronously generates requests at a specified rate
|
||||||
with OPTIONAL burstiness.
|
with OPTIONAL burstiness.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
input_requests:
|
input_requests:
|
||||||
A list of input requests, each represented as a tuple.
|
A list of input requests, each represented as a tuple.
|
||||||
request_rate:
|
request_rate:
|
||||||
The rate at which requests are generated (requests/s).
|
The rate at which requests are generated (requests/s).
|
||||||
burstiness (optional):
|
burstiness (optional):
|
||||||
The burstiness factor of the request generation.
|
The burstiness factor of the request generation.
|
||||||
Only takes effect when request_rate is not inf.
|
Only takes effect when request_rate is not inf.
|
||||||
Default value is 1, which follows a Poisson process.
|
Default value is 1, which follows a Poisson process.
|
||||||
Otherwise, the request intervals follow a gamma distribution.
|
Otherwise, the request intervals follow a gamma distribution.
|
||||||
A lower burstiness value (0 < burstiness < 1) results
|
A lower burstiness value (0 < burstiness < 1) results
|
||||||
in more bursty requests, while a higher burstiness value
|
in more bursty requests, while a higher burstiness value
|
||||||
(burstiness > 1) results in a more uniform arrival of requests.
|
(burstiness > 1) results in a more uniform arrival of requests.
|
||||||
"""
|
"""
|
||||||
input_requests = iter(input_requests)
|
input_requests = iter(input_requests)
|
||||||
@ -423,7 +449,7 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: List[str],
|
selected_percentile_metrics: List[str],
|
||||||
selected_percentiles: List[float],
|
selected_percentiles: List[float],
|
||||||
gootput_config_dict: Dict[str, float],
|
goodput_config_dict: Dict[str, float],
|
||||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||||
actual_output_lens: List[int] = []
|
actual_output_lens: List[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
@ -436,19 +462,23 @@ def calculate_metrics(
|
|||||||
e2els: List[float] = []
|
e2els: List[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
if outputs[i].success:
|
if outputs[i].success:
|
||||||
# We use the tokenizer to count the number of output tokens for all
|
output_len = outputs[i].output_tokens
|
||||||
# serving backends instead of looking at len(outputs[i].itl) since
|
|
||||||
# multiple output tokens may be bundled together
|
if output_len is None:
|
||||||
# Note : this may inflate the output token count slightly
|
# We use the tokenizer to count the number of output tokens
|
||||||
output_len = len(
|
# for some serving backends instead of looking at
|
||||||
tokenizer(outputs[i].generated_text,
|
# len(outputs[i].itl) since multiple output tokens may be
|
||||||
add_special_tokens=False).input_ids)
|
# bundled together
|
||||||
|
# Note : this may inflate the output token count slightly
|
||||||
|
output_len = len(
|
||||||
|
tokenizer(outputs[i].generated_text,
|
||||||
|
add_special_tokens=False).input_ids)
|
||||||
actual_output_lens.append(output_len)
|
actual_output_lens.append(output_len)
|
||||||
total_input += input_requests[i][1]
|
total_input += input_requests[i][1]
|
||||||
tpot = 0
|
tpot = 0
|
||||||
if output_len > 1:
|
if output_len > 1:
|
||||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
|
||||||
1)
|
tpot = latency_minus_ttft / (output_len - 1)
|
||||||
tpots.append(tpot)
|
tpots.append(tpot)
|
||||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||||
all_tpots.append(tpot)
|
all_tpots.append(tpot)
|
||||||
@ -459,21 +489,21 @@ def calculate_metrics(
|
|||||||
else:
|
else:
|
||||||
actual_output_lens.append(0)
|
actual_output_lens.append(0)
|
||||||
|
|
||||||
if gootput_config_dict:
|
if goodput_config_dict:
|
||||||
valid_metrics = []
|
valid_metrics = []
|
||||||
slo_values = []
|
slo_values = []
|
||||||
|
|
||||||
if "ttft" in gootput_config_dict:
|
if "ttft" in goodput_config_dict:
|
||||||
valid_metrics.append(ttfts)
|
valid_metrics.append(ttfts)
|
||||||
slo_values.append(gootput_config_dict["ttft"] /
|
slo_values.append(goodput_config_dict["ttft"] /
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
if "tpot" in gootput_config_dict:
|
if "tpot" in goodput_config_dict:
|
||||||
valid_metrics.append(all_tpots)
|
valid_metrics.append(all_tpots)
|
||||||
slo_values.append(gootput_config_dict["tpot"] /
|
slo_values.append(goodput_config_dict["tpot"] /
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
if "e2el" in gootput_config_dict:
|
if "e2el" in goodput_config_dict:
|
||||||
valid_metrics.append(e2els)
|
valid_metrics.append(e2els)
|
||||||
slo_values.append(gootput_config_dict["e2el"] /
|
slo_values.append(goodput_config_dict["e2el"] /
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
|
||||||
for req_metric in zip(*valid_metrics):
|
for req_metric in zip(*valid_metrics):
|
||||||
@ -525,6 +555,7 @@ async def benchmark(
|
|||||||
api_url: str,
|
api_url: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
model_id: str,
|
model_id: str,
|
||||||
|
model_name: str,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_requests: List[Tuple[str, int, int]],
|
input_requests: List[Tuple[str, int, int]],
|
||||||
logprobs: Optional[int],
|
logprobs: Optional[int],
|
||||||
@ -536,8 +567,9 @@ async def benchmark(
|
|||||||
selected_percentile_metrics: List[str],
|
selected_percentile_metrics: List[str],
|
||||||
selected_percentiles: List[str],
|
selected_percentiles: List[str],
|
||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
gootput_config_dict: Dict[str, float],
|
goodput_config_dict: Dict[str, float],
|
||||||
max_concurrency: Optional[int],
|
max_concurrency: Optional[int],
|
||||||
|
lora_modules: Optional[List[str]],
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@ -553,6 +585,7 @@ async def benchmark(
|
|||||||
"Multi-modal content is only supported on 'openai-chat' backend.")
|
"Multi-modal content is only supported on 'openai-chat' backend.")
|
||||||
test_input = RequestFuncInput(
|
test_input = RequestFuncInput(
|
||||||
model=model_id,
|
model=model_id,
|
||||||
|
model_name=model_name,
|
||||||
prompt=test_prompt,
|
prompt=test_prompt,
|
||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
@ -562,6 +595,7 @@ async def benchmark(
|
|||||||
multi_modal_content=test_mm_content,
|
multi_modal_content=test_mm_content,
|
||||||
ignore_eos=ignore_eos,
|
ignore_eos=ignore_eos,
|
||||||
)
|
)
|
||||||
|
|
||||||
test_output = await request_func(request_func_input=test_input)
|
test_output = await request_func(request_func_input=test_input)
|
||||||
if not test_output.success:
|
if not test_output.success:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -570,9 +604,15 @@ async def benchmark(
|
|||||||
else:
|
else:
|
||||||
print("Initial test run completed. Starting main benchmark run...")
|
print("Initial test run completed. Starting main benchmark run...")
|
||||||
|
|
||||||
|
if lora_modules:
|
||||||
|
# For each input request, choose a LoRA module at random.
|
||||||
|
lora_modules = iter(
|
||||||
|
[random.choice(lora_modules) for _ in range(len(input_requests))])
|
||||||
|
|
||||||
if profile:
|
if profile:
|
||||||
print("Starting profiler...")
|
print("Starting profiler...")
|
||||||
profile_input = RequestFuncInput(model=model_id,
|
profile_input = RequestFuncInput(model=model_id,
|
||||||
|
model_name=model_name,
|
||||||
prompt=test_prompt,
|
prompt=test_prompt,
|
||||||
api_url=base_url + "/start_profile",
|
api_url=base_url + "/start_profile",
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
@ -615,7 +655,13 @@ async def benchmark(
|
|||||||
tasks: List[asyncio.Task] = []
|
tasks: List[asyncio.Task] = []
|
||||||
async for request in get_request(input_requests, request_rate, burstiness):
|
async for request in get_request(input_requests, request_rate, burstiness):
|
||||||
prompt, prompt_len, output_len, mm_content = request
|
prompt, prompt_len, output_len, mm_content = request
|
||||||
request_func_input = RequestFuncInput(model=model_id,
|
req_model_id, req_model_name = model_id, model_name
|
||||||
|
if lora_modules:
|
||||||
|
req_lora_module = next(lora_modules)
|
||||||
|
req_model_id, req_model_name = req_lora_module, req_lora_module
|
||||||
|
|
||||||
|
request_func_input = RequestFuncInput(model=req_model_id,
|
||||||
|
model_name=req_model_name,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
@ -657,7 +703,7 @@ async def benchmark(
|
|||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
selected_percentile_metrics=selected_percentile_metrics,
|
selected_percentile_metrics=selected_percentile_metrics,
|
||||||
selected_percentiles=selected_percentiles,
|
selected_percentiles=selected_percentiles,
|
||||||
gootput_config_dict=gootput_config_dict,
|
goodput_config_dict=goodput_config_dict,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||||
@ -669,7 +715,7 @@ async def benchmark(
|
|||||||
metrics.total_output))
|
metrics.total_output))
|
||||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||||
metrics.request_throughput))
|
metrics.request_throughput))
|
||||||
if gootput_config_dict:
|
if goodput_config_dict:
|
||||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||||
metrics.request_goodput))
|
metrics.request_goodput))
|
||||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||||
@ -684,7 +730,7 @@ async def benchmark(
|
|||||||
"total_output_tokens": metrics.total_output,
|
"total_output_tokens": metrics.total_output,
|
||||||
"request_throughput": metrics.request_throughput,
|
"request_throughput": metrics.request_throughput,
|
||||||
"request_goodput:":
|
"request_goodput:":
|
||||||
metrics.request_goodput if gootput_config_dict else None,
|
metrics.request_goodput if goodput_config_dict else None,
|
||||||
"output_throughput": metrics.output_throughput,
|
"output_throughput": metrics.output_throughput,
|
||||||
"total_token_throughput": metrics.total_token_throughput,
|
"total_token_throughput": metrics.total_token_throughput,
|
||||||
"input_lens": [output.prompt_len for output in outputs],
|
"input_lens": [output.prompt_len for output in outputs],
|
||||||
@ -740,11 +786,11 @@ async def benchmark(
|
|||||||
|
|
||||||
def check_goodput_args(args):
|
def check_goodput_args(args):
|
||||||
# Check and parse goodput arguments
|
# Check and parse goodput arguments
|
||||||
gootput_config_dict = {}
|
goodput_config_dict = {}
|
||||||
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||||
if args.goodput:
|
if args.goodput:
|
||||||
gootput_config_dict = parse_goodput(args.goodput)
|
goodput_config_dict = parse_goodput(args.goodput)
|
||||||
for slo_name, slo_val in gootput_config_dict.items():
|
for slo_name, slo_val in goodput_config_dict.items():
|
||||||
if slo_name not in VALID_NAMES:
|
if slo_name not in VALID_NAMES:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||||
@ -755,22 +801,48 @@ def check_goodput_args(args):
|
|||||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||||
"The service level objective value should be "
|
"The service level objective value should be "
|
||||||
"non-negative.")
|
"non-negative.")
|
||||||
return gootput_config_dict
|
return goodput_config_dict
|
||||||
|
|
||||||
|
|
||||||
def parse_goodput(slo_pairs):
|
def parse_goodput(slo_pairs):
|
||||||
gootput_config_dict = {}
|
goodput_config_dict = {}
|
||||||
try:
|
try:
|
||||||
for slo_pair in slo_pairs:
|
for slo_pair in slo_pairs:
|
||||||
slo_name, slo_val = slo_pair.split(":")
|
slo_name, slo_val = slo_pair.split(":")
|
||||||
gootput_config_dict[slo_name] = float(slo_val)
|
goodput_config_dict[slo_name] = float(slo_val)
|
||||||
except ValueError as err:
|
except ValueError as err:
|
||||||
raise argparse.ArgumentTypeError(
|
raise argparse.ArgumentTypeError(
|
||||||
"Invalid format found for service level objectives. "
|
"Invalid format found for service level objectives. "
|
||||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||||
"pairs, where the key is a metric name, and the value is a "
|
"pairs, where the key is a metric name, and the value is a "
|
||||||
"number in milliseconds.") from err
|
"number in milliseconds.") from err
|
||||||
return gootput_config_dict
|
return goodput_config_dict
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||||
|
results: Dict[str, Any],
|
||||||
|
file_name: str) -> None:
|
||||||
|
metrics = [
|
||||||
|
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
|
||||||
|
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
|
||||||
|
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
|
||||||
|
]
|
||||||
|
# These raw data might be useful, but they are rather big. They can be added
|
||||||
|
# later if needed
|
||||||
|
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
|
||||||
|
pt_records = convert_to_pytorch_benchmark_format(
|
||||||
|
args=args,
|
||||||
|
metrics={k: [results[k]]
|
||||||
|
for k in metrics},
|
||||||
|
extra_info={
|
||||||
|
k: results[k]
|
||||||
|
for k in results if k not in metrics and k not in ignored_metrics
|
||||||
|
})
|
||||||
|
if pt_records:
|
||||||
|
# Don't use json suffix here as we don't want CI to pick it up
|
||||||
|
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
|
||||||
|
with open(pt_file, "w") as f:
|
||||||
|
json.dump(pt_records, f)
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
@ -780,6 +852,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
backend = args.backend
|
backend = args.backend
|
||||||
model_id = args.model
|
model_id = args.model
|
||||||
|
model_name = args.served_model_name
|
||||||
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
|
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
|
||||||
tokenizer_mode = args.tokenizer_mode
|
tokenizer_mode = args.tokenizer_mode
|
||||||
|
|
||||||
@ -815,6 +888,14 @@ def main(args: argparse.Namespace):
|
|||||||
fixed_output_len=args.sharegpt_output_len,
|
fixed_output_len=args.sharegpt_output_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif args.dataset_name == "burstgpt":
|
||||||
|
input_requests = sample_burstgpt_requests(
|
||||||
|
dataset_path=args.dataset_path,
|
||||||
|
num_requests=args.num_prompts,
|
||||||
|
random_seed=args.seed,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
elif args.dataset_name == "sonnet":
|
elif args.dataset_name == "sonnet":
|
||||||
# Do not format the prompt, pass to message directly
|
# Do not format the prompt, pass to message directly
|
||||||
if args.backend == "openai-chat":
|
if args.backend == "openai-chat":
|
||||||
@ -869,7 +950,11 @@ def main(args: argparse.Namespace):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||||
|
|
||||||
gootput_config_dict = check_goodput_args(args)
|
goodput_config_dict = check_goodput_args(args)
|
||||||
|
|
||||||
|
# Avoid GC processing "static" data - reduce pause times.
|
||||||
|
gc.collect()
|
||||||
|
gc.freeze()
|
||||||
|
|
||||||
benchmark_result = asyncio.run(
|
benchmark_result = asyncio.run(
|
||||||
benchmark(
|
benchmark(
|
||||||
@ -877,6 +962,7 @@ def main(args: argparse.Namespace):
|
|||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
|
model_name=model_name,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
input_requests=input_requests,
|
input_requests=input_requests,
|
||||||
logprobs=args.logprobs,
|
logprobs=args.logprobs,
|
||||||
@ -890,8 +976,9 @@ def main(args: argparse.Namespace):
|
|||||||
float(p) for p in args.metric_percentiles.split(",")
|
float(p) for p in args.metric_percentiles.split(",")
|
||||||
],
|
],
|
||||||
ignore_eos=args.ignore_eos,
|
ignore_eos=args.ignore_eos,
|
||||||
gootput_config_dict=gootput_config_dict,
|
goodput_config_dict=goodput_config_dict,
|
||||||
max_concurrency=args.max_concurrency,
|
max_concurrency=args.max_concurrency,
|
||||||
|
lora_modules=args.lora_modules,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
@ -919,8 +1006,8 @@ def main(args: argparse.Namespace):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Traffic
|
# Traffic
|
||||||
result_json["request_rate"] = (
|
result_json["request_rate"] = (args.request_rate if args.request_rate
|
||||||
args.request_rate if args.request_rate < float("inf") else "inf")
|
< float("inf") else "inf")
|
||||||
result_json["burstiness"] = args.burstiness
|
result_json["burstiness"] = args.burstiness
|
||||||
result_json["max_concurrency"] = args.max_concurrency
|
result_json["max_concurrency"] = args.max_concurrency
|
||||||
|
|
||||||
@ -938,6 +1025,7 @@ def main(args: argparse.Namespace):
|
|||||||
file_name = os.path.join(args.result_dir, file_name)
|
file_name = os.path.join(args.result_dir, file_name)
|
||||||
with open(file_name, "w", encoding='utf-8') as outfile:
|
with open(file_name, "w", encoding='utf-8') as outfile:
|
||||||
json.dump(result_json, outfile)
|
json.dump(result_json, outfile)
|
||||||
|
save_to_pytorch_benchmark_format(args, result_json, file_name)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -955,7 +1043,8 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="Server or API base url if not using http host and port.",
|
help="Server or API base url if not using http host and port.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--host", type=str, default="localhost")
|
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
|
||||||
|
parser.add_argument("--host", type=str, default="127.0.0.1")
|
||||||
parser.add_argument("--port", type=int, default=8000)
|
parser.add_argument("--port", type=int, default=8000)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--endpoint",
|
"--endpoint",
|
||||||
@ -974,7 +1063,7 @@ if __name__ == "__main__":
|
|||||||
"--dataset-name",
|
"--dataset-name",
|
||||||
type=str,
|
type=str,
|
||||||
default="sharegpt",
|
default="sharegpt",
|
||||||
choices=["sharegpt", "sonnet", "random", "hf"],
|
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
|
||||||
help="Name of the dataset to benchmark on.",
|
help="Name of the dataset to benchmark on.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--dataset-path",
|
parser.add_argument("--dataset-path",
|
||||||
@ -1216,11 +1305,26 @@ if __name__ == "__main__":
|
|||||||
'--tokenizer-mode',
|
'--tokenizer-mode',
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
choices=['auto', 'slow', 'mistral'],
|
choices=['auto', 'slow', 'mistral', 'custom'],
|
||||||
help='The tokenizer mode.\n\n* "auto" will use the '
|
help='The tokenizer mode.\n\n* "auto" will use the '
|
||||||
'fast tokenizer if available.\n* "slow" will '
|
'fast tokenizer if available.\n* "slow" will '
|
||||||
'always use the slow tokenizer. \n* '
|
'always use the slow tokenizer. \n* '
|
||||||
'"mistral" will always use the `mistral_common` tokenizer.')
|
'"mistral" will always use the `mistral_common` tokenizer. \n*'
|
||||||
|
'"custom" will use --tokenizer to select the preregistered tokenizer.')
|
||||||
|
|
||||||
|
parser.add_argument("--served-model-name",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="The model name used in the API. "
|
||||||
|
"If not specified, the model name will be the "
|
||||||
|
"same as the ``--model`` argument. ")
|
||||||
|
|
||||||
|
parser.add_argument("--lora-modules",
|
||||||
|
nargs='+',
|
||||||
|
default=None,
|
||||||
|
help="A subset of LoRA module names passed in when "
|
||||||
|
"launching the server. For each request, the "
|
||||||
|
"script chooses a LoRA module at random.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
r"""Benchmark online serving throughput with guided decoding.
|
r"""Benchmark online serving throughput with guided decoding.
|
||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
@ -730,7 +731,8 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="Server or API base url if not using http host and port.",
|
help="Server or API base url if not using http host and port.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--host", type=str, default="localhost")
|
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
|
||||||
|
parser.add_argument("--host", type=str, default="127.0.0.1")
|
||||||
parser.add_argument("--port", type=int, default=8000)
|
parser.add_argument("--port", type=int, default=8000)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--endpoint",
|
"--endpoint",
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
"""Benchmark offline inference throughput."""
|
"""Benchmark offline inference throughput."""
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import List, Optional
|
from functools import cache
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import uvloop
|
import uvloop
|
||||||
|
from benchmark_utils import convert_to_pytorch_benchmark_format
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
@ -17,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
|||||||
from vllm.entrypoints.openai.api_server import (
|
from vllm.entrypoints.openai.api_server import (
|
||||||
build_async_engine_client_from_engine_args)
|
build_async_engine_client_from_engine_args)
|
||||||
from vllm.inputs import TextPrompt
|
from vllm.inputs import TextPrompt
|
||||||
|
from vllm.lora.request import LoRARequest
|
||||||
|
from vllm.lora.utils import get_adapter_absolute_path
|
||||||
from vllm.multimodal import MultiModalDataDict
|
from vllm.multimodal import MultiModalDataDict
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||||
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
||||||
|
|
||||||
|
|
||||||
@ -28,15 +35,17 @@ class SampleRequest:
|
|||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
prompt: The input text prompt for the model.
|
prompt: The input text prompt for the model.
|
||||||
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
|
|
||||||
images).
|
|
||||||
prompt_len: The length of the prompt in tokens.
|
prompt_len: The length of the prompt in tokens.
|
||||||
expected_output_len: The expected length of the output in tokens.
|
expected_output_len: The expected length of the output in tokens.
|
||||||
|
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
|
||||||
|
images).
|
||||||
|
lora_request: Optional LoRARequest specifying the LoRA to use.
|
||||||
"""
|
"""
|
||||||
prompt: str
|
prompt: str
|
||||||
prompt_len: int
|
prompt_len: int
|
||||||
expected_output_len: int
|
expected_output_len: int
|
||||||
multi_modal_data: Optional[MultiModalDataDict] = None
|
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||||
|
lora_request: Optional[LoRARequest] = None
|
||||||
|
|
||||||
|
|
||||||
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
||||||
@ -60,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
|||||||
raise ValueError(f"Unsupported model {model}")
|
raise ValueError(f"Unsupported model {model}")
|
||||||
|
|
||||||
|
|
||||||
|
@cache
|
||||||
|
def lora_path_on_disk(lora_path: str) -> str:
|
||||||
|
return get_adapter_absolute_path(lora_path)
|
||||||
|
|
||||||
|
|
||||||
|
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_lora_request(
|
||||||
|
args: argparse.Namespace
|
||||||
|
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
|
||||||
|
global lora_tokenizer_cache
|
||||||
|
lora_id = random.randint(1, args.max_loras)
|
||||||
|
lora_request = LoRARequest(lora_name=str(lora_id),
|
||||||
|
lora_int_id=lora_id,
|
||||||
|
+                               lora_path=lora_path_on_disk(args.lora_path))
+    if lora_id not in lora_tokenizer_cache:
+        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+    return lora_request, lora_tokenizer_cache[lora_id]
+
+
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:

     dataset_path: str = args.dataset
     num_requests: int = args.num_prompts
     fixed_output_len: Optional[int] = args.output_len
@@ -79,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,

     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for data in dataset:
+    for data in tqdm(dataset,
+                     total=len(filtered_dataset),
+                     desc="sampling requests"):
         if len(filtered_dataset) == num_requests:
             break

@@ -102,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                 continue
             prompt = _get_prompt_for_image_model(question=prompt, model=model)

+        request_tokenizer = tokenizer
+        lora_request: Optional[LoRARequest] = None
+        if args.enable_lora:
+            lora_request, lora_tokenizer = get_random_lora_request(args)
+            if lora_tokenizer:
+                request_tokenizer = lora_tokenizer
+
         # Tokenize the prompts and completions.
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion_token_ids = tokenizer(completion).input_ids
+        prompt_token_ids = request_tokenizer(prompt).input_ids
+        completion_token_ids = request_tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
@@ -118,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
             SampleRequest(prompt=prompt,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data))
+                          multi_modal_data=multi_modal_data,
+                          lora_request=lora_request))

     return filtered_dataset

@@ -146,14 +187,21 @@ def run_vllm(
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
             ))
+    lora_requests: Optional[List[LoRARequest]] = None
+    if engine_args.enable_lora:
+        lora_requests = [request.lora_request for request in requests]

     use_beam_search = False

     if not use_beam_search:
         start = time.perf_counter()
-        llm.generate(prompts, sampling_params, use_tqdm=True)
+        llm.generate(prompts,
+                     sampling_params,
+                     lora_request=lora_requests,
+                     use_tqdm=True)
         end = time.perf_counter()
     else:
+        assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
         output_len = requests[0][2]
@@ -185,6 +233,7 @@ async def run_vllm_async(
     # Add the requests to the engine.
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
+    lora_requests: List[Optional[LoRARequest]] = []
     for request in requests:
         prompts.append(
             TextPrompt(prompt=request.prompt,
@@ -197,11 +246,16 @@ async def run_vllm_async(
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
             ))
+        lora_requests.append(request.lora_request)

     generators = []
     start = time.perf_counter()
-    for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
-        generator = llm.generate(prompt, sp, request_id=f"test{i}")
+    for i, (prompt, sp,
+            lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+        generator = llm.generate(prompt,
+                                 sp,
+                                 lora_request=lr,
+                                 request_id=f"test{i}")
         generators.append(generator)
     all_gens = merge_async_iterators(*generators)
     async for i, res in all_gens:
@@ -286,6 +340,25 @@ def run_mii(
     return end - start


+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={
+            "requests_per_second": [results["requests_per_second"]],
+            "tokens_per_second": [results["tokens_per_second"]],
+        },
+        extra_info={
+            k: results[k]
+            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        })
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -297,6 +370,14 @@ def main(args: argparse.Namespace):
         vocab_size = tokenizer.vocab_size
         requests = []
         for _ in range(args.num_prompts):
+
+            request_tokenizer = tokenizer
+            lora_request: Optional[LoRARequest] = None
+            if args.enable_lora:
+                lora_request, lora_tokenizer = get_random_lora_request(args)
+                if lora_tokenizer:
+                    request_tokenizer = lora_tokenizer
+
             # Synthesize a prompt with the given input length.
             candidate_ids = [
                 random.randint(0, vocab_size - 1)
@@ -305,8 +386,8 @@ def main(args: argparse.Namespace):
             # As tokenizer may add additional tokens like BOS, we need to try
             # different lengths to get the desired input length.
             for _ in range(5):  # Max attempts to correct
-                candidate_prompt = tokenizer.decode(candidate_ids)
-                tokenized_len = len(tokenizer.encode(candidate_prompt))
+                candidate_prompt = request_tokenizer.decode(candidate_ids)
+                tokenized_len = len(request_tokenizer.encode(candidate_prompt))

                 if tokenized_len == args.input_len:
                     break
@@ -323,7 +404,8 @@ def main(args: argparse.Namespace):
             requests.append(
                 SampleRequest(prompt=candidate_prompt,
                               prompt_len=args.input_len,
-                              expected_output_len=args.output_len))
+                              expected_output_len=args.output_len,
+                              lora_request=lora_request))
     else:
         requests = sample_requests(tokenizer, args)

@@ -374,6 +456,7 @@ def main(args: argparse.Namespace):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)


 if __name__ == "__main__":
@@ -422,6 +505,14 @@ if __name__ == "__main__":
         action='store_true',
         default=False,
         help="Disable decoupled async engine frontend.")
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
+
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
     if args.tokenizer is None:
@@ -431,6 +522,8 @@ if __name__ == "__main__":
         assert args.output_len is not None
     else:
         assert args.input_len is None
+    if args.enable_lora:
+        assert args.lora_path is not None

     if args.backend == "vllm":
         if args.hf_max_batch_size is not None:
@@ -440,6 +533,9 @@ if __name__ == "__main__":
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
     elif args.backend == "mii":
         if args.dtype != "auto":
             raise ValueError("dtype must be auto for MII backend.")
@@ -452,4 +548,7 @@ if __name__ == "__main__":
         if args.tokenizer != args.model:
             raise ValueError("Tokenizer must be the same as the model for MII "
                              "backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
     main(args)
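For readers skimming the diff above, the new helper simply caches one tokenizer per LoRA adapter id so that repeated requests against the same adapter do not reload it. A minimal standalone sketch of that caching pattern (names and the loader callback here are illustrative, not the benchmark's own code):

from typing import Callable, Dict, Optional

lora_tokenizer_cache: Dict[int, Optional[object]] = {}

def get_cached_lora_tokenizer(lora_id: int,
                              load_tokenizer: Callable[[int], object]):
    # Load each adapter's tokenizer once; later requests reuse the cached one.
    if lora_id not in lora_tokenizer_cache:
        lora_tokenizer_cache[lora_id] = load_tokenizer(lora_id)
    return lora_tokenizer_cache[lora_id]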
benchmarks/benchmark_utils.py (new file, 39 lines)

# SPDX-License-Identifier: Apache-2.0

import argparse
import os
from typing import Any, Dict, List


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: Dict[str, List],
                                        extra_info: Dict[str, Any]) -> List:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }
        records.append(record)

    return records
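A minimal sketch of how this helper might be driven, assuming it is run from the benchmarks/ directory; the argparse namespace and metric values below are made up purely for illustration:

import argparse
import json

from benchmark_utils import convert_to_pytorch_benchmark_format

# Hypothetical args and results, only to show the calling convention.
args = argparse.Namespace(model="meta-llama/Llama-2-7b-hf", num_prompts=100)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"requests_per_second": [12.3]},
    extra_info={"elapsed_time": 8.1})
# Records are only produced when SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set.
print(json.dumps(records, indent=2))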
benchmarks/cutlass_benchmarks/sparse_benchmarks.py (new file, 386 lines)

# SPDX-License-Identifier: Apache-2.0

import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             **kwargs) -> TMeasurement:
    min_run_time = 1

    globals = {
        "args": args,
        "kwargs": kwargs,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
        print("Incorrect results")
        print(out)
        print(out_ref)
    else:
        print("Correct results")

    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16),
                 b.to(dtype=torch.bfloat16)))

    # pytorch impl - float16
    timers.append(
        bench_fn(label, sub_label,
                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))

    # cutlass impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
                 torch.bfloat16))

    # cutlass with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
                 bias))

    # cutlass sparse impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16))

    # cutlass sparse with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16, bias))

    return timers


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
                                                     k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
        print("Incorrect results")
        print(out)
        print(out_ref)
    else:
        print("Correct results")

    timers = []

    # pytorch impl w. bf16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda")))

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16))

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16,
                 use_fast_accum=True))

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16))

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 b,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16,
                 use_fast_accum=True))

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
                 torch.bfloat16))

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16))

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.float16))

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.bfloat16, bias))

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
                 scale_b, torch.float16, bias.to(dtype=torch.float16)))

    return timers


def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
          sub_label: str) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
        return bench_fp8(dtype, m, k, n, label, sub_label)
    raise ValueError("unsupported type")


# runner
def print_timers(timers: Iterable[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
                       f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)

    return results


# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    timestamp = int(time.time()) if timestamp is None else timestamp
    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
        pkl.dump(data, f)


# argparse runners


def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


def run_range_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
    n = len(dim_sizes)
    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")


def run_model_bench(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KNs.append(KN)
        return KNs

    model_bench_data = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        Ms = args.batch_sizes
        KNs = model_shapes(model, tp_size)
        MKNs = []
        for m in Ms:
            for k, n in KNs:
                MKNs.append((m, k, n))

        data = run(args.dtype, MKNs)
        model_bench_data.append(data)

    # Print all results
    for data, model_tp in zip(model_bench_data, models_tps):
        model, tp_size = model_tp
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    all_data = []
    for d in model_bench_data:
        all_data.extend(d)
    # pickle all data
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
        pkl.dump(all_data, f)


if __name__ == '__main__':

    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = FlexibleArgumentParser(
        description="""
Benchmark Cutlass GEMM.

    To run square GEMMs:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64

    To run constant N and K and sweep M:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384

    To run dimensions from a model:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1

    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
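The pickled output of this script can be reloaded for later inspection. A minimal sketch of that round trip; the filename below is only an example of the naming pattern the script uses, not a file that necessarily exists:

import pickle as pkl

import torch.utils.benchmark as TBenchmark

# Hypothetical output file produced by a model_bench run.
with open("model_bench-torch.float8_e4m3fn-1700000000.pkl", "rb") as f:
    measurements = pkl.load(f)

# Re-render the saved torch.utils.benchmark measurements as a comparison table.
TBenchmark.Compare(measurements).print()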
benchmarks/cutlass_benchmarks/utils.py (new file, 98 lines)

# SPDX-License-Identifier: Apache-2.0

# Cutlass bench utils
from typing import Iterable, Tuple

import torch

import vllm._custom_ops as ops


def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(
        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)


def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.bfloat16)


def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.float16)


def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
    if dtype == torch.float8_e4m3fn:
        return to_fp8(a), to_fp8(b)

    raise ValueError("unsupported dtype")


def prune_to_2_4(tensor):
    # Reshape tensor to [N, 4] where N is number of groups of 4
    original_shape = tensor.shape
    reshaped = tensor.reshape(-1, 4)

    # Get indices of top 2 absolute values in each group of 4
    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)

    # Create binary mask
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1,
                  index=indices,
                  src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask

    # Turn all -0.0 to 0.0
    pruned[pruned == -0.0] = 0.0

    return pruned.reshape(original_shape)


def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
                             k: int) -> Tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    b = prune_to_2_4(b.t()).t()

    if dtype == torch.int8:
        a, b = to_int8(a), to_int8(b)
    elif dtype == torch.float8_e4m3fn:
        a, b = to_fp8(a), to_fp8(b)
    elif dtype == torch.float16:
        a, b = to_fp16(a), to_fp16(b)
    elif dtype == torch.bfloat16:
        a, b = to_bf16(a), to_bf16(b)
    else:
        raise ValueError("unsupported dtype")

    b_compressed, e = ops.cutlass_sparse_compress(b.t())

    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b


def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                               m: int, n: int, k: int) -> \
        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
        if b_comp is not None:
            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
    BComps, Es, As, Bs = zip(*ABs)
    return list(BComps), list(Es), list(As), list(Bs)
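To make the 2:4 pattern that prune_to_2_4 enforces concrete, a quick sketch run on CPU (the tensor values are made up; the import assumes the script is run from benchmarks/cutlass_benchmarks/):

import torch

from utils import prune_to_2_4

x = torch.tensor([[0.1, -3.0, 2.0, 0.5],
                  [4.0, 0.2, -0.1, 1.5]])
# In every group of 4 values only the 2 largest magnitudes survive.
print(prune_to_2_4(x))
# tensor([[0.0, -3.0, 2.0, 0.0],
#         [4.0,  0.0, 0.0, 1.5]])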
@@ -1,47 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from typing import Callable, Iterable, List, Optional, Tuple

 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
+from utils import make_rand_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul)
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 DEFAULT_TP_SIZES = [1]

-# helpers
-
-
-def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor) -> torch.Tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
-
 # bench
 def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
@@ -62,8 +42,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
     ).blocked_autorange(min_run_time=min_run_time)


-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
+def bench_int8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
     a, b = make_rand_tensors(torch.int8, m, n, k)
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
@@ -72,155 +59,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
     azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)

+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "cutlass_i8_i8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_i8_i8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, None, bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp, bias),
+    }
+
     timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

     return timers


-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+def bench_fp8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
     a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    a_cont = a.contiguous()
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    block_scale_a = torch.rand((m, k // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_b = torch.rand((k // 128, n // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_a_M_major = block_scale_a.t().contiguous().t()
+    block_scale_b_K_major = block_scale_b.t().contiguous().t()
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

+    print(m, k, n)
+
+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "pytorch_fp8_fp8_fp16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.float16),
+        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.float16,
+                                 use_fast_accum=True),
+        "pytorch_fp8_fp8_bf16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
+        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.bfloat16,
+                                 use_fast_accum=True),
+        "cutlass_fp8_fp8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_fp8_fp8_fp16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
+        "cutlass_fp8_fp8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_fp8_fp8_fp16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
+                                      bias.to(dtype=torch.float16)),
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
+                                      block_scale_b.t(), (128, 128)),
+        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
+                                      block_scale_b_K_major, torch.float16),
+    }
+
     timers = []
-
-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
-
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

     return timers


-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
+def bench(dtype: torch.dtype,
+          m: int,
+          k: int,
+          n: int,
+          label: str,
+          sub_label: str,
+          bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
+        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
     if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
+        return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
     raise ValueError("unsupported type")


@@ -231,18 +195,22 @@ def print_timers(timers: Iterable[TMeasurement]):


 def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[Tuple[int, int, int]],
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
+        timers = bench(dtype,
+                       m,
+                       k,
+                       n,
+                       f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})",
+                       bench_kernels=bench_kernels)
         print_timers(timers)
         results.extend(timers)

     return results


-# output makers
 def make_output(data: Iterable[TMeasurement],
                 MKNs: Iterable[Tuple[int, int, int]],
                 base_description: str,
@@ -256,15 +224,11 @@ def make_output(data: Iterable[TMeasurement],
         pkl.dump(data, f)


-# argparse runners
-
-
 def run_square_bench(args):
     dim_sizes = list(
         range(args.dim_start, args.dim_end + 1, args.dim_increment))
     MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)

     make_output(data, MKNs, f"square_bench-{args.dtype}")


@@ -275,8 +239,7 @@ def run_range_bench(args):
     Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
     Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
     MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)

     make_output(data, MKNs, f"range_bench-{args.dtype}")


@@ -302,7 +265,7 @@ def run_model_bench(args):
         for k, n in KNs:
             MKNs.append((m, k, n))

-        data = run(args.dtype, MKNs)
+        data = run(args.dtype, MKNs, bench_kernels=args.kernels)
         model_bench_data.append(data)

     # Print all results
@@ -352,6 +315,15 @@ Benchmark Cutlass GEMM.
                         type=to_torch_dtype,
                         required=True,
                         help="Available options are ['int8', 'fp8']")
+    parser.add_argument(
+        "--kernels",
+        nargs="+",
+        type=str,
+        default=None,
+        help=
+        "Exact names of the kernels to benchmark. If not set, runs all kernels."
+    )
+
     subparsers = parser.add_subparsers(dest="cmd")

     square_parser = subparsers.add_parser("square_bench")
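The dict-of-lambdas rewrite above is what makes the new --kernels filter cheap to implement: selection is just a name lookup over the registry. A standalone sketch of the same selection logic, with placeholder kernel names rather than the real kernel set:

from typing import Callable, Dict, List, Optional

def select_kernels(bench_fns: Dict[str, Callable[[], None]],
                   bench_kernels: Optional[List[str]] = None):
    # None means "run everything"; otherwise only exact name matches run.
    for name, fn in bench_fns.items():
        if bench_kernels is None or name in bench_kernels:
            yield name, fn

demo = {"kernel_a": lambda: None, "kernel_b": lambda: None}
print([name for name, _ in select_kernels(demo, ["kernel_b"])])  # ['kernel_b']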
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 # Weight Shapes are in the format
 #  ([K, N], TP_SPLIT_DIM)
 # Example:
@@ -40,4 +42,4 @@ WEIGHT_SHAPES = {
         ([8192, 57344], 1),
         ([28672, 8192], 0),
     ],
 }
@@ -10,7 +10,8 @@ set -ex

 kill_gpu_processes() {
   # kill all processes on GPU.
-  pkill -f pt_main_thread
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
   sleep 10

   # remove vllm config file
@@ -54,7 +55,7 @@ benchmark() {

   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@ benchmark() {

   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@ benchmark() {
     --port 8100 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1.json \
     --request-rate "inf"


@@ -105,7 +106,7 @@ benchmark() {
     --port 8200 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1_overhead.json \
     --request-rate "$qps"
   kill_gpu_processes

@@ -118,7 +119,7 @@ main() {
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)

-  pip install quart httpx
+  pip install quart httpx datasets

   cd "$(dirname "$0")"

@@ -1,13 +1,12 @@
 #!/bin/bash

-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.


-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
 # Approaches:
-# 1. Chunked prefill: 1 vllm instance with tp=8
 # 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
 # Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
     --request-rate "$qps"

   sleep 2
-
 }


@@ -123,8 +121,9 @@ main() {
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)

-  pip install quart httpx matplotlib aiohttp
+  pip install quart httpx matplotlib aiohttp datasets

   cd "$(dirname "$0")"

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os

 import aiohttp
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import itertools

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json

 import matplotlib.pyplot as plt
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle as pkl
 import time
 from dataclasses import dataclass
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 from typing import Optional
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time

 import torch

benchmarks/kernels/benchmark_lora.py (new file, 1149 lines)
File diff suppressed because it is too large.

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import itertools
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List

 import torch
@ -1,6 +1,9 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from itertools import product
|
||||||
from typing import Any, Dict, List, Tuple, TypedDict
|
from typing import Any, Dict, List, Tuple, TypedDict
|
||||||
|
|
||||||
import ray
|
import ray
|
||||||
@ -13,6 +16,9 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
|
||||||
|
) else torch.float8_e4m3fn
|
||||||
|
|
||||||
|
|
||||||
class BenchmarkConfig(TypedDict):
|
class BenchmarkConfig(TypedDict):
|
||||||
BLOCK_SIZE_M: int
|
BLOCK_SIZE_M: int
|
||||||
@ -80,8 +86,8 @@ def benchmark_config(
|
|||||||
a1_scale = torch.randn(1, dtype=torch.float32)
|
a1_scale = torch.randn(1, dtype=torch.float32)
|
||||||
a2_scale = torch.randn(1, dtype=torch.float32)
|
a2_scale = torch.randn(1, dtype=torch.float32)
|
||||||
|
|
||||||
w1 = w1.to(torch.float8_e4m3fn)
|
w1 = w1.to(FP8_DTYPE)
|
||||||
w2 = w2.to(torch.float8_e4m3fn)
|
w2 = w2.to(FP8_DTYPE)
|
||||||
|
|
||||||
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
||||||
|
|
||||||
@@ -141,28 +147,172 @@ def benchmark_config(
     return avg
 
 
-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
+def get_rocm_tuning_space(use_fp16):
+    block_mn_range = [16, 32, 64, 128, 256]
+    block_k_range = [16, 32, 64, 128, 256]
+    if not use_fp16:
+        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8
+    num_warps_range = [1, 2, 4, 8]
+    group_m_range = [1, 4, 8, 16, 32]
+    num_stage_range = [2]
+    waves_per_eu_range = [0]
+    matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
+    kpack_range = [1, 2] if use_fp16 else []
+
+    param_ranges = {
+        "BLOCK_SIZE_M": block_mn_range,
+        "BLOCK_SIZE_N": block_mn_range,
+        "BLOCK_SIZE_K": block_k_range,
+        "GROUP_SIZE_M": group_m_range,
+        "num_warps": num_warps_range,
+        "num_stages": num_stage_range,
+        "waves_per_eu": waves_per_eu_range,
+    }
+    if use_fp16:
+        param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
+        param_ranges["kpack"] = kpack_range
+
+    return param_ranges
+
+
+def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
     configs: List[BenchmarkConfig] = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128, 256]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
+
+    if current_platform.is_rocm():
+        param_ranges = get_rocm_tuning_space(use_fp16)
+    else:
+        # Reduced search space for faster tuning.
+        # TODO(woosuk): Increase the search space and use a performance model to
+        # prune the search space.
+        block_m_range = [16, 32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [64, 128, 256]
+        num_warps_range = [4, 8]
+        group_m_range = [1, 16, 32, 64]
+        num_stage_range = [2, 3, 4, 5]
+
+        param_ranges = {
+            "BLOCK_SIZE_M": block_m_range,
+            "BLOCK_SIZE_N": block_n_range,
+            "BLOCK_SIZE_K": block_k_range,
+            "GROUP_SIZE_M": group_m_range,
+            "num_warps": num_warps_range,
+            "num_stages": num_stage_range,
+        }
+
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
     return configs
 
 
+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+                            search_space, is_fp16):
+    N1, K1 = shard_intermediate_size, hidden_size
+    N2, K2 = hidden_size, shard_intermediate_size // 2
+    pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
+                                        is_fp16)
+    pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
+                                        is_fp16)
+    search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
+    return search_space
+
+
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    pruned_configs = []
+    elemBytes_a = 2 if is_fp16 else 1
+    elemBytes_b = 2 if is_fp16 else 1
+
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+
+        if is_fp16:
+            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+            if matrix_instr_nonkdim > mfma:
+                continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = config.get("SPLIT_K", 1)
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if (matrix_instr_nonkdim > BLOCK_SIZE_M
+                    or matrix_instr_nonkdim > BLOCK_SIZE_N):
+                continue
+            if (matrix_instr_nonkdim >= M
+                    and matrix_instr_nonkdim != BLOCK_SIZE_M):
+                continue
+            if (matrix_instr_nonkdim >= N
+                    and matrix_instr_nonkdim != BLOCK_SIZE_N):
+                continue
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
+               BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+    return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
+
+
+def merge_unique_dicts(list1, list2):
+    result = []
+    combined_list = list1.copy()
+    combined_list.extend(list2)
+    for dictionary in combined_list:
+        if dictionary not in result:
+            result.append(dictionary)
+    return result
+
+
 @ray.remote(num_gpus=1)
 class BenchmarkWorker:
 
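The rewritten get_configs_compute_bound builds the tuning grid with itertools.product over a dict of parameter ranges instead of hand-written nested loops, and merge_unique_dicts later deduplicates pruned lists by plain dict equality. A minimal, self-contained sketch of that grid expansion (toy ranges chosen only for illustration, not the real search space):

from itertools import product

param_ranges = {  # toy ranges for illustration only
    "BLOCK_SIZE_M": [16, 32],
    "num_warps": [4, 8],
}
keys, values = zip(*param_ranges.items())
configs = [dict(zip(keys, combo)) for combo in product(*values)]
print(len(configs))   # 4 = 2 * 2 combinations
print(configs[0])     # {'BLOCK_SIZE_M': 16, 'num_warps': 4}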
@@ -170,6 +320,10 @@ class BenchmarkWorker:
         torch.set_default_device("cuda")
         current_platform.seed_everything(seed)
         self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])
 
     def benchmark(
         self,
@@ -191,9 +345,13 @@ class BenchmarkWorker:
         op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
                                     dtype_str)
         if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
+            config = get_default_config(num_tokens,
+                                        num_experts,
+                                        shard_intermediate_size,
+                                        hidden_size,
+                                        topk,
+                                        dtype_str,
+                                        is_marlin=False)
         else:
             config = op_config[min(op_config.keys(),
                                    key=lambda x: abs(x - num_tokens))]
@@ -217,25 +375,33 @@ class BenchmarkWorker:
     ) -> Dict[str, int]:
         best_config = None
         best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8_w8a8,
-                                               use_int8_w8a16,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
+        if current_platform.is_rocm():
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            search_space = prune_rocm_search_space(num_tokens,
+                                                   shard_intermediate_size,
+                                                   hidden_size, search_space,
+                                                   is_fp16)
+
+        with torch.cuda.device(self.device_id):
+            for config in tqdm(search_space):
+                try:
+                    kernel_time = benchmark_config(config,
+                                                   num_tokens,
+                                                   num_experts,
+                                                   shard_intermediate_size,
+                                                   hidden_size,
+                                                   topk,
+                                                   dtype,
+                                                   use_fp8_w8a8,
+                                                   use_int8_w8a16,
+                                                   num_iters=20)
+                except triton.runtime.autotuner.OutOfResources:
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         assert best_config is not None
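The tuning loop above follows a simple pattern: benchmark every surviving config, skip configurations that fail (OutOfResources in the real script), and keep the fastest one. A minimal sketch of that selection logic with a dummy timing function, so it runs without Triton or a GPU; fake_benchmark is purely illustrative:

import random

def fake_benchmark(config):
    # Stand-in for benchmark_config(); "fails" the way OutOfResources does.
    if config["BLOCK_SIZE_M"] > 128:
        raise RuntimeError("out of resources")
    return random.uniform(0.1, 1.0)  # pretend kernel time in ms

search_space = [{"BLOCK_SIZE_M": m} for m in (16, 64, 256)]
best_config, best_time = None, float("inf")
for config in search_space:
    try:
        t = fake_benchmark(config)
    except RuntimeError:
        continue  # invalid config, skip it and keep tuning
    if t < best_time:
        best_time, best_config = t, config
print(best_config, best_time)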
@@ -244,12 +410,27 @@ class BenchmarkWorker:
 
 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
+        "BLOCK_SIZE_M":
+        config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N":
+        config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K":
+        config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M":
+        config["GROUP_SIZE_M"],
+        "num_warps":
+        config["num_warps"],
+        "num_stages":
+        config["num_stages"],
+        **({
+            "waves_per_eu": config["waves_per_eu"]
+        } if "waves_per_eu" in config else {}),
+        **({
+            "matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
+        } if "matrix_instr_nonkdim" in config else {}),
+        **({
+            "kpack": config["kpack"]
+        } if "kpack" in config else {}),
     }
 
 
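sort_config now appends the ROCm-only keys with the **({...} if key in config else {}) idiom, so waves_per_eu, matrix_instr_nonkdim and kpack only show up in the saved config when they were actually tuned. A small sketch of that idiom with made-up values:

config = {"BLOCK_SIZE_M": 64, "num_warps": 4, "kpack": 2}

out = {
    "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
    "num_warps": config["num_warps"],
    # Only merged in when the key exists; otherwise an empty dict is spread.
    **({"kpack": config["kpack"]} if "kpack" in config else {}),
    **({"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {}),
}
print(out)  # {'BLOCK_SIZE_M': 64, 'num_warps': 4, 'kpack': 2}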
@@ -275,7 +456,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
 def main(args: argparse.Namespace):
     print(args)
 
-    config = AutoConfig.from_pretrained(args.model)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -286,6 +468,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "DeepseekV3ForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -294,7 +481,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
 
@@ -322,7 +509,8 @@ def main(args: argparse.Namespace):
         return ray.get(outputs)
 
     if args.tune:
-        search_space = get_configs_compute_bound()
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16)
         print(f"Start tuning over {len(search_space)} configurations...")
 
         start = time.time()
@@ -354,7 +542,11 @@ if __name__ == "__main__":
     parser.add_argument("--model",
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--tp-size", "-tp", type=int, default=2)
+    parser.add_argument("--tp-size",
+                        "-tp",
+                        "--tensor-parallel-size",
+                        type=int,
+                        default=2)
     parser.add_argument("--dtype",
                         type=str,
                         choices=["auto", "fp8_w8a8", "int8_w8a16"],
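The --tp-size option gains a long alias, --tensor-parallel-size, simply by passing several option strings to one add_argument call; argparse derives the attribute name (tp_size) from the first long option, so all spellings land on the same field. A quick sketch of that behaviour:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--tp-size", "-tp", "--tensor-parallel-size",
                    type=int, default=2)
args = parser.parse_args(["--tensor-parallel-size", "4"])
print(args.tp_size)  # 4; every spelling sets the same attribute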
@@ -362,6 +554,7 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
     args = parser.parse_args()
 
     main(args)
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import random
 import time
 from typing import List, Optional
@@ -98,7 +100,9 @@ def main(
     start_time = time.perf_counter()
 
     # Using default kv_scale
-    k_scale = v_scale = 1.0
+    k_scale = v_scale = torch.tensor(1.0,
+                                     dtype=torch.float32,
+                                     device=device)
 
     for _ in range(num_iters):
         if version == "v1":
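The paged-attention benchmark now passes k_scale and v_scale as float32 tensors on the target device rather than Python floats, presumably to match kernels that take tensor-valued scales. A minimal sketch of constructing such default (identity) scales, falling back to CPU when no GPU is available:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# Default (identity) scales as 0-dim float32 tensors on the chosen device.
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
print(k_scale.dtype, k_scale.device)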
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 
 import torch
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import Optional, Tuple, Union
 
Some files were not shown because too many files have changed in this diff.