mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
Compare commits
2341 Commits
v0.7.3
...
releases/v
Author | SHA1 | Date | |
---|---|---|---|
b6553be1bc | |||
64a9af5afa | |||
e4248849ec | |||
467bef18a3 | |||
5f1ac1e1d1 | |||
9368cc90b2 | |||
32b3946bb4 | |||
6b1391ca7e | |||
a3f66e75d1 | |||
319cb1e351 | |||
1efef71645 | |||
646d62f636 | |||
6cd4ae8acd | |||
c016047ed7 | |||
9af6d22e4c | |||
4589b94032 | |||
cc867be19c | |||
3a7cd627a8 | |||
8058c91108 | |||
7d44c469fe | |||
31f58be96a | |||
ebb2f383b8 | |||
c1c7dbbeeb | |||
5cf2daea9a | |||
b8089195b4 | |||
770e5dcdb8 | |||
c57c9415b1 | |||
01810f9236 | |||
59abbd84f9 | |||
95a6568b5c | |||
0eca5eacd0 | |||
12e5829221 | |||
3a4d417707 | |||
8335667c22 | |||
e1c4380d4c | |||
e31ae3de36 | |||
2ffb9b6e07 | |||
cda10fa3e2 | |||
c123bc33f9 | |||
b9a1791e2c | |||
989dcee981 | |||
3d64d366e0 | |||
eaa2e51088 | |||
d77f7fb871 | |||
2d8476e465 | |||
88be823d57 | |||
4e4f63ad45 | |||
d2f0e7e615 | |||
122cdca5f6 | |||
cf02f9b283 | |||
c4296b1a27 | |||
66c508b137 | |||
84166fee97 | |||
6e0cd10f72 | |||
e010688f50 | |||
441b65d8c7 | |||
46ecc57973 | |||
b6a3a9f76d | |||
ca27f0f9c1 | |||
aad30bd306 | |||
94ecee6282 | |||
8267f9916f | |||
7353492a47 | |||
7661e92ef8 | |||
f168b85725 | |||
da511d54d8 | |||
65c69444b1 | |||
94870359cd | |||
0d49483ea9 | |||
90b78ec5f9 | |||
91a2ef98ea | |||
3da2313d78 | |||
b61dc5f972 | |||
f8a1a2d108 | |||
3465b87ef8 | |||
c8134bea15 | |||
cb6d572e85 | |||
87360308b7 | |||
aa49f14832 | |||
9ef9173cfa | |||
85e2b7bb13 | |||
61059bee40 | |||
ec89524f50 | |||
f20f9f063b | |||
9bc8bb07cf | |||
1aeb925f34 | |||
188a4590d8 | |||
18093084be | |||
da40380214 | |||
8fc57501d3 | |||
af7fc84fd2 | |||
0678b52251 | |||
25b918eee6 | |||
a408820f2f | |||
c56ed8bb0e | |||
78dcf56cb3 | |||
b2fac67130 | |||
23027e2daf | |||
c3fd4d669a | |||
ef3f98b59f | |||
7ee2590478 | |||
53a5a0ce30 | |||
d459fae0a2 | |||
c8dcc15921 | |||
8f4ffbd373 | |||
5f2cd251d2 | |||
02658c2dfe | |||
01dc9a76db | |||
35cf32df30 | |||
8711bc5e68 | |||
2669a0d7b5 | |||
8e972d9c44 | |||
3336c8cfbe | |||
b124e1085b | |||
41aa578428 | |||
8d646c2e53 | |||
5d6d1adf15 | |||
1409ef9134 | |||
4555143ea7 | |||
52dceb172d | |||
abd7df2fca | |||
b712be98c7 | |||
a8da78eac9 | |||
5d96533e22 | |||
4de790fcad | |||
b5fd9506c1 | |||
135cf55cd1 | |||
6cac54f4d1 | |||
6865fe0074 | |||
e31446b6c8 | |||
bdf13965ab | |||
fa98d77773 | |||
01eee40536 | |||
19bdaf32b1 | |||
02f0c7b220 | |||
d054da1992 | |||
4b7817c119 | |||
d00dd65cd4 | |||
d81edded69 | |||
476844d44c | |||
4e68ae5e59 | |||
4e88723f32 | |||
118ff92111 | |||
ec2dcd80bc | |||
42243fbda0 | |||
6d18ed2a2e | |||
f32fcd9444 | |||
d32aa2e670 | |||
cc977286e7 | |||
17430e3653 | |||
1282bd812e | |||
bdce64f236 | |||
9e6f61e8c3 | |||
8655f47f37 | |||
4ce42f9204 | |||
8a57872b2a | |||
5bc1ad6cee | |||
9112b443a0 | |||
c57d577e8d | |||
ca2f6b9c30 | |||
20133cfee2 | |||
ebb1ec9318 | |||
5b168b6d7a | |||
9760fd8f6a | |||
b9f61e1387 | |||
d6fd3a33b8 | |||
432ec9926e | |||
2b102d51ad | |||
aa54a7bf7b | |||
2ad6194a02 | |||
c594cbf565 | |||
a35ca765a5 | |||
6aa8f9a4e7 | |||
1bc86a3da1 | |||
bbfa0c61d1 | |||
20079c6e36 | |||
9a1b9b99d7 | |||
8bf507d766 | |||
306d60401d | |||
f2c3f66d59 | |||
0f5e0d567e | |||
c55d804672 | |||
749f5bdd38 | |||
2a50ef5760 | |||
b8b904795d | |||
ba5111f237 | |||
1e123529d7 | |||
dff80b0e42 | |||
7782464a17 | |||
0f71e24034 | |||
1dab4d5718 | |||
7f21e8052b | |||
5a8641638a | |||
f49239cb45 | |||
2dbe8c0774 | |||
84ec470fca | |||
b29ca5c4d5 | |||
ec6833c5e9 | |||
e1fadf1197 | |||
43ff405b90 | |||
fba02e3bd1 | |||
4577fc9abb | |||
5f1d0c8118 | |||
c3bb9f2331 | |||
8f8900cee9 | |||
6acb7a6285 | |||
4f4a6b844a | |||
4d0a1541be | |||
77b6e74fe2 | |||
5acf828d99 | |||
3987e2ae96 | |||
77164dad5e | |||
3de3eadf5b | |||
3132290a14 | |||
1aa2f81b43 | |||
d54af615d5 | |||
a1cc9f33a3 | |||
a521ef06e5 | |||
64eaf5fe05 | |||
d1d61f3351 | |||
32ce3cf7c9 | |||
d58f9c7f7a | |||
c29034037d | |||
1b7cfd5a36 | |||
da4b69d0b4 | |||
c9479b2920 | |||
6f2909405e | |||
b169d5f7b6 | |||
f8977c233f | |||
f274581f44 | |||
0b1447f890 | |||
24d0ef8970 | |||
7fcfd954ff | |||
e740d07f07 | |||
a652e71dd0 | |||
34d6c447c4 | |||
972eddf7c9 | |||
fd7bb88d72 | |||
3c49dbdd03 | |||
1661a9c28f | |||
8e882ffdc0 | |||
26b4fa45be | |||
515b413ebf | |||
269d901734 | |||
7951d78738 | |||
6dbe5b5c93 | |||
643622ba46 | |||
a09c7ca9f2 | |||
0e98964e94 | |||
c68b5c63eb | |||
fced756923 | |||
321331b8ae | |||
6e4cea1cc5 | |||
435fa95444 | |||
4c2b38ce9e | |||
d781930f90 | |||
ce75efeecb | |||
aa42561e40 | |||
de65fc8e1e | |||
0c492b7824 | |||
0f0926b43f | |||
7f2c1a87e9 | |||
b78f844a67 | |||
5e13c07d00 | |||
774c5fde30 | |||
9a21e331ff | |||
3e9ce609bd | |||
794ae1f551 | |||
d73a9457a5 | |||
a3896c7f02 | |||
51e98e4ffd | |||
e56f44d9ec | |||
e0cbad4e30 | |||
b48d5cca16 | |||
5873877241 | |||
696259ca01 | |||
6b6d496114 | |||
aaa4ac1c95 | |||
06a0338015 | |||
4318c0559d | |||
a68e293cb9 | |||
6881107948 | |||
e0f0ff87b8 | |||
c24b1572ac | |||
4693a3438c | |||
bbd9a84dc5 | |||
a547aeb828 | |||
fc6d0c290f | |||
753944fa9b | |||
25a817f202 | |||
d260f799a9 | |||
b50602d5f0 | |||
1f1b1bc03b | |||
1f88dbd2bb | |||
0eebd74842 | |||
27bebcd897 | |||
e7523c2e03 | |||
a869baca73 | |||
82e2339b06 | |||
9553fdb41e | |||
243eb9199f | |||
0665e29998 | |||
e76be06550 | |||
0877750029 | |||
6d68030f1c | |||
5a2c76cbe1 | |||
38b13dfe78 | |||
61a45e7a72 | |||
65523a0995 | |||
4b7740a105 | |||
4ea62c0ea0 | |||
561b77a0d6 | |||
abd4030d94 | |||
8820821b59 | |||
fba0642704 | |||
6071e989df | |||
57fd13a707 | |||
3a886bd58c | |||
35be8fad62 | |||
f2faac745d | |||
279f854519 | |||
624b77a2b3 | |||
503f8487c2 | |||
44073a7ac3 | |||
63934543a0 | |||
75f81750f3 | |||
6ab681bcbe | |||
cebc22f3b6 | |||
6c6dcd8611 | |||
7891fdf0c6 | |||
6825d9a998 | |||
b554ab736e | |||
9ea7f1abf3 | |||
2807271c86 | |||
b9018a3f9f | |||
4ceafb6299 | |||
2e6705784f | |||
1cb194a018 | |||
2cd4d58df4 | |||
6d166a8d35 | |||
ef1dd6870f | |||
e77dc4bad8 | |||
07458a51ce | |||
c1e4a4052d | |||
a859320575 | |||
441dc63ac7 | |||
d55e446d13 | |||
ec82c3e388 | |||
45ab403a1f | |||
2b10ba7491 | |||
4fc1bf813a | |||
f2036734fb | |||
7d9216495c | |||
0ddf88e16e | |||
1645b60196 | |||
2628a69e35 | |||
371f7e4ca2 | |||
15b45ffb9a | |||
273cb3b4d9 | |||
8ddd1cf26a | |||
6550114c9c | |||
9520a989df | |||
3d28ad343f | |||
6a7988c55b | |||
022d8abe29 | |||
5221815a00 | |||
1068556b2c | |||
2cd1fa4556 | |||
d4c2919760 | |||
6220f3c6b0 | |||
52fb23f47e | |||
6dd51c7ef1 | |||
2edb533af2 | |||
38a95cb4a8 | |||
cd821ea5d2 | |||
7ab056c273 | |||
6526e05111 | |||
e493e48524 | |||
4ce64e2df4 | |||
fbb13a2c15 | |||
a1fe24d961 | |||
d0bc2f810b | |||
b046cf792d | |||
54af915949 | |||
71ea614d4a | |||
4c611348a7 | |||
60cad94b86 | |||
9c1baa5bc6 | |||
4be2255c81 | |||
ed5d408255 | |||
583507d130 | |||
e44d8ce8c7 | |||
93ecb8139c | |||
fae453f8ce | |||
4b0da7b60e | |||
c6b636f9fb | |||
04eb88dc80 | |||
46791e1b4b | |||
c32e249a23 | |||
c91fe7b1b9 | |||
a04720bc36 | |||
7b9d832c80 | |||
6e588da0f4 | |||
f8d2cc5f55 | |||
721fb9b181 | |||
1f3a1200e4 | |||
54631f8262 | |||
cb506ecb5a | |||
93f71673ce | |||
3f505233fd | |||
4e04eceb58 | |||
71075029f2 | |||
ca86a7cf6e | |||
a35a494745 | |||
f6037d1907 | |||
fa72f9a812 | |||
ebed81fbf5 | |||
e2d7d31244 | |||
23b67b37b2 | |||
db5a29ba19 | |||
51797775c3 | |||
cf5984b2fe | |||
d022115cc6 | |||
acb54ca8e1 | |||
6e0fd34d3c | |||
176d62e4ea | |||
20bd6f4d2e | |||
1f079540db | |||
94d8ec8d2b | |||
bb0a311213 | |||
dd5fa7e04f | |||
2b16104557 | |||
371376f996 | |||
c6c10ca920 | |||
c154d89306 | |||
eca18691d2 | |||
61acfc45bc | |||
107f5fc4cb | |||
907f935de9 | |||
5d7f545204 | |||
cd8dfc6dfc | |||
d06dd72ba9 | |||
ad0012a0ac | |||
92247c522e | |||
0c15c2e486 | |||
3b17ea26e4 | |||
23baa2180b | |||
980a172474 | |||
e1f5a71ed7 | |||
f4a8a37465 | |||
8f55962a7f | |||
be48360c1f | |||
86847700d7 | |||
d6c86d09ae | |||
6b35cb10a0 | |||
1b1e8e05ff | |||
bca55b556f | |||
d981396778 | |||
9609327fa4 | |||
f07a673eb2 | |||
d565e0976f | |||
258bf621d5 | |||
dc1440cf9f | |||
8171221834 | |||
7937c2fd52 | |||
e2ee1e8e9e | |||
20d8ce81eb | |||
84ab4feb7e | |||
6781af5608 | |||
1b15df2546 | |||
43b5f61dce | |||
c5bb0ebdc6 | |||
d637b96099 | |||
275c5daeb0 | |||
47fda6d089 | |||
27d0952600 | |||
221cfc2fea | |||
9da1095daf | |||
d1211f8794 | |||
b6a6e7a529 | |||
4fb349f66a | |||
908733aca7 | |||
1a8f68bb90 | |||
9ab2c02ff8 | |||
66e63e86ec | |||
9214e60631 | |||
f880d42582 | |||
dcfe95234c | |||
48ac2bed5b | |||
3e0d435027 | |||
4ee4826ede | |||
60017dc841 | |||
55f1a468d9 | |||
fd195b194e | |||
fabe89bbc4 | |||
e73b7dfd69 | |||
7fdfa01530 | |||
aef94c6d07 | |||
0ceaebf87b | |||
1db4f47f81 | |||
d3d91b6f71 | |||
87d871470d | |||
a5f8c111c2 | |||
e23564cb70 | |||
390ec88905 | |||
541817670c | |||
67da5720d4 | |||
5c04bb8b86 | |||
3d2779c29a | |||
6b31c84aff | |||
b18201fe06 | |||
f4937a51c1 | |||
ee659e3b60 | |||
4e1c6a0264 | |||
c7852a6d9b | |||
8795eb9975 | |||
0b34593017 | |||
e3f3aee6f4 | |||
92540529c0 | |||
fadb8d5c2d | |||
2aa5470ac5 | |||
51ff154639 | |||
566ec04c3d | |||
01c22335ba | |||
451da4bcbd | |||
07ad27121f | |||
a9944aabfa | |||
a8f5aec20a | |||
de71fec81b | |||
70f8b96724 | |||
dd2a94596a | |||
420caf7557 | |||
4f07a64075 | |||
e6b8e65d2d | |||
26d0419309 | |||
83f74c698f | |||
2dff093574 | |||
afe3236e90 | |||
65334ef3b9 | |||
e60f550b38 | |||
f25e0d1125 | |||
09f106a91e | |||
2142035b51 | |||
78aa341d12 | |||
7974736740 | |||
2fc9075b82 | |||
d93c976a0d | |||
749f792553 | |||
856865008e | |||
f9c069c85e | |||
418d2f8bfb | |||
964472b966 | |||
59dd311cf5 | |||
d066e52013 | |||
c8ea982d9b | |||
dc372b9c8a | |||
9b5b39b650 | |||
9ccc6ded42 | |||
d62a076e84 | |||
259127f8b8 | |||
612c2edb4f | |||
38fe728d60 | |||
82e7f9bb03 | |||
63dc3426e0 | |||
8f5dc41481 | |||
63ad622233 | |||
e7ef61c1f0 | |||
d4154c35a2 | |||
6685890d11 | |||
33011318c2 | |||
4f8b373225 | |||
7b2f28deba | |||
2d912fb66f | |||
12e6c0b41c | |||
9a2a6357de | |||
6266c57bae | |||
754b699cbe | |||
6e27c6d86b | |||
d5af47a149 | |||
65f0f74b66 | |||
176a95c670 | |||
f2ae883b67 | |||
40de1ef455 | |||
0189a65a2e | |||
55aa7af994 | |||
0b217da646 | |||
19324d660c | |||
fc407a1425 | |||
009d9e7590 | |||
b922c2ebd2 | |||
00b14e0f16 | |||
54e467e6f8 | |||
79a1d25bbd | |||
9944011b30 | |||
8c946cecca | |||
ff334ca1cd | |||
6223dd8114 | |||
906f0598fc | |||
cb528d0585 | |||
98fcba1575 | |||
23b3134eb5 | |||
ea6ae8cb45 | |||
2ff297dce9 | |||
8dd0671bac | |||
f0d610a8ae | |||
e57e4d6e9e | |||
ee5be834e7 | |||
48545728d8 | |||
dc1a821768 | |||
61e0a506a3 | |||
1df491c522 | |||
d8487ef557 | |||
c06af9a959 | |||
60f7624334 | |||
f6518b2b48 | |||
d67085c2c8 | |||
307939f299 | |||
9d7ea9dbbf | |||
acee8f48aa | |||
f065de4e88 | |||
dc9905368d | |||
ebab1ac37c | |||
2b0db9b0e2 | |||
195adb47c0 | |||
302f3aca7e | |||
e9c730c9bd | |||
289199feb6 | |||
b9fd0d7a69 | |||
72a3f6b898 | |||
98ea35601c | |||
d19110204c | |||
05a4324f8e | |||
7ea6cb28b2 | |||
9fbf2bfbd5 | |||
3a5ea75129 | |||
891b9d33de | |||
430783018c | |||
19a3c78d1f | |||
ada50aa295 | |||
08bf784078 | |||
d45fe333fb | |||
021c16c7ca | |||
7de18d541b | |||
a810b5b088 | |||
009b3d5382 | |||
e4b8713380 | |||
06c0922a69 | |||
cd3edfc908 | |||
9cea90eab4 | |||
d1110f5b5a | |||
8132365b74 | |||
eea22a56ab | |||
9112155283 | |||
90d0a74b60 | |||
d74e5f37bc | |||
ca66a1674c | |||
950751a987 | |||
4c31218f80 | |||
68311891f5 | |||
fc4441a4ee | |||
246e3e0a36 | |||
7042cc96b0 | |||
0c0fdae84f | |||
3b602cdea7 | |||
4b2ed7926a | |||
7e3571134f | |||
ea2236bf95 | |||
7d4aedae7c | |||
22481fbfa3 | |||
5c4c08f6f1 | |||
c44c384b1c | |||
85b72cb7b1 | |||
6e5595ca39 | |||
200da9a517 | |||
9f64e93415 | |||
ec61ea20a8 | |||
c6798baa9c | |||
5b2dcbf0b8 | |||
6e4a93e3f7 | |||
217db4baa6 | |||
ff8c400502 | |||
89a0315f4c | |||
3d1e387652 | |||
d310e6de98 | |||
5e6f939484 | |||
760e3ecc8f | |||
3c9396a64f | |||
376786fac1 | |||
4f605a6de5 | |||
8342e3abd1 | |||
a83a0f92b5 | |||
226a4272cf | |||
ec54d73c31 | |||
a944f8ede7 | |||
015815fe01 | |||
e4ca6e3a99 | |||
53d0cb7423 | |||
f50dcb7c21 | |||
a1e19b635d | |||
bb239a730f | |||
a463555dee | |||
ca04b97c93 | |||
0a9bbaa104 | |||
39956efb3f | |||
597051e56f | |||
96722aa81d | |||
843b222723 | |||
e515668edf | |||
5a499e70d5 | |||
6930a41116 | |||
998eea4a0e | |||
c747d84576 | |||
b2da14a05a | |||
7ea2adb802 | |||
3d13ca0e24 | |||
66ab3b13c9 | |||
a8238bbdb0 | |||
d43f914d42 | |||
ed5272cf21 | |||
c20ef40fd0 | |||
db593aa67f | |||
f98e307588 | |||
646a31e51e | |||
be8ff88e66 | |||
1a6af1453d | |||
32aa74c09c | |||
7377dd0307 | |||
98c89e16ff | |||
324a3119b0 | |||
8a15c2603a | |||
043e4c4955 | |||
ba7703e659 | |||
f80ae5bdcf | |||
1a45a61387 | |||
c3e9d5060e | |||
822de7fb94 | |||
8d84d836d1 | |||
950b71186f | |||
e50a1f1a9c | |||
a17cef70ea | |||
18dd5e01f2 | |||
6de3e13413 | |||
ed3a1d2106 | |||
022afbeb4e | |||
2f925e5777 | |||
de906b95f9 | |||
d456aea71f | |||
621ca2c0ab | |||
6115b11582 | |||
5b8c390747 | |||
7525d5f3d5 | |||
aabcd2cae3 | |||
0d115460a7 | |||
175bda67a1 | |||
cba31c47c4 | |||
a6fed02068 | |||
d419aa5dc4 | |||
f9bc5a0693 | |||
05e1f96419 | |||
6eae34533a | |||
63ced7b43f | |||
dc47ba32f8 | |||
edbf2d609e | |||
999328be0d | |||
98834fefaa | |||
90bd2ae172 | |||
5941e0b7ea | |||
9765940824 | |||
5ea5c514da | |||
d3efde8176 | |||
aea302be6c | |||
cc05b90d86 | |||
1d0c9d6b2d | |||
f62cad6431 | |||
5394ad7387 | |||
68e1ee0072 | |||
2858830c39 | |||
d6484ef3c3 | |||
46fae69cf0 | |||
f66f1e0fa3 | |||
887d7af882 | |||
a92842454c | |||
c8386fa61d | |||
87baebebd8 | |||
e3d0a1d190 | |||
d47b605eca | |||
22c6f6397f | |||
3ec97e2cc5 | |||
9b103a1d76 | |||
b90b0852e9 | |||
9352cdb56d | |||
182f40ea8b | |||
3e887d2e0c | |||
0f87d8f7b2 | |||
4c33d67321 | |||
cb234955df | |||
3a500cd0b6 | |||
868c546da4 | |||
99404f53c7 | |||
785d75a03b | |||
6d1479ca4b | |||
b8b0859b5c | |||
d7543862bd | |||
c777df79f7 | |||
cc2a77d7f1 | |||
9e2de9b9e9 | |||
109e15a335 | |||
f192ca90e6 | |||
f89d0e11bf | |||
b4003d11fc | |||
292fc59d61 | |||
afcb3f8863 | |||
afb12e4294 | |||
24aebae177 | |||
39c0813a7f | |||
9b70e2b4c1 | |||
173daac19d | |||
04f2cfc894 | |||
811a6c0972 | |||
9b1769dd9a | |||
61c299f81f | |||
4acfa3354a | |||
88c8304104 | |||
6768ff4a22 | |||
f2e7af9b86 | |||
7423cf0a9b | |||
460a2b1100 | |||
28566d73b3 | |||
98060b001d | |||
f5a3c655b2 | |||
7169f87ad0 | |||
b74d888c63 | |||
2007d4d54f | |||
48e925fab5 | |||
1903c0b8a3 | |||
86a1f67a3b | |||
a257d9bccc | |||
015069b017 | |||
fbefc8a78d | |||
26bc4bbcd8 | |||
3c3d767201 | |||
13cf6b6236 | |||
90d0a54c4d | |||
7a0a146c54 | |||
7ab643e425 | |||
afb4429b4f | |||
aa4502e7f3 | |||
17b4d85f63 | |||
1144a8efe7 | |||
08fb5587b4 | |||
dbc18e7816 | |||
02bd654846 | |||
200bbf92e8 | |||
81ecf425f0 | |||
42d9a2c4c7 | |||
2ac74d098e | |||
584f5fb4c6 | |||
d586ddc691 | |||
0b7e701dd4 | |||
947f2f5375 | |||
739e03b344 | |||
da4e7687b5 | |||
39317cf42b | |||
2990cee95b | |||
0be6d05b5e | |||
77073c77bc | |||
a7d5b016bd | |||
d803786731 | |||
1534d389af | |||
ece5a8b0b6 | |||
54072f315f | |||
be633fba0f | |||
ed6cfb90c8 | |||
6ed9f6047e | |||
a44c4f1d2f | |||
88fcf00dda | |||
d1f569b1b9 | |||
13698db634 | |||
2c4f59afc3 | |||
1c2bc7ead0 | |||
4055130a85 | |||
34120f5acd | |||
7489ec0bab | |||
70788bdbdc | |||
c9c1b59e59 | |||
0350809f3a | |||
a6977dbd15 | |||
2fa2a50bf9 | |||
08e15defa9 | |||
b37685afbb | |||
792595b59d | |||
0c1c788312 | |||
56d64fbe30 | |||
608968b7c5 | |||
06ffc7e1d3 | |||
d3cf61b89b | |||
a39203f99e | |||
24e6ad3f16 | |||
2ef5d106bb | |||
0ed27ef66c | |||
900edfa8d4 | |||
88ad9ec6b2 | |||
40896bdf3f | |||
00ee37efa2 | |||
890f104cdf | |||
4a5e13149a | |||
97cc8729f0 | |||
4464109219 | |||
193e78e35d | |||
bdb2cddafc | |||
ebb3930d28 | |||
cde384cd92 | |||
96e06e3cb7 | |||
17eb306fcc | |||
165cb56329 | |||
d6da8a8ff2 | |||
b4ac4fa04d | |||
e136000595 | |||
86d9fc29cb | |||
506475de5f | |||
cfe4532093 | |||
8fc88d63f1 | |||
6e74fd4945 | |||
dcbac4cb4b | |||
ed2462030f | |||
cc5befbced | |||
2c89cd96a8 | |||
a0304dc504 | |||
c7941cca18 | |||
b6dd32aa07 | |||
f94886946e | |||
72dfe4c74f | |||
8b464d9660 | |||
889ebb2638 | |||
3ad986c28b | |||
344e193b7d | |||
fb1c933ade | |||
72c5b97231 | |||
fa93cd9f60 | |||
aec9674dbe | |||
7fcc4223dc | |||
8262a3e23b | |||
f211331c48 | |||
9053d0b134 | |||
cb3f2d8d10 | |||
c12df53b60 | |||
d1aeea7553 | |||
d8bccde686 | |||
20e489eaa1 | |||
4213475ec7 | |||
d92879baf6 | |||
690fe019f0 | |||
ed7a29d9f8 | |||
756848e79e | |||
18445edd0f | |||
30215ca61f | |||
838cedade7 | |||
4283a28c2f | |||
93a126fbc7 | |||
8e4b351a0c | |||
9869453c42 | |||
3642c59aa8 | |||
43eea2953b | |||
de7eb10ce4 | |||
fd11a325b8 | |||
4d17e20310 | |||
10fd1d7380 | |||
52b4f4a8d7 | |||
e782e0a170 | |||
dc2ceca5c5 | |||
f8acd01ff7 | |||
c48334d405 | |||
909fdaf152 | |||
8c1c926d00 | |||
df6f3ce883 | |||
513f074766 | |||
b07bf83c7d | |||
53e8cf53a4 | |||
54271bb766 | |||
9e96f56efb | |||
b278911229 | |||
7bd0c7745c | |||
1cf0719ebd | |||
537d5ee025 | |||
c8e5be35f7 | |||
a6e72e1e4f | |||
5e83a7277f | |||
68af5f6c5c | |||
8de2901fea | |||
c53e0730cb | |||
a0e619e62a | |||
70116459c3 | |||
65e262b93b | |||
43faa0461a | |||
48cb2109b6 | |||
a5450f11c9 | |||
9d98ab5ec6 | |||
df5c879527 | |||
423e9f1cbe | |||
0bd7f8fca5 | |||
d5615af9ae | |||
19dcc02a72 | |||
7feae92c1f | |||
f851b84266 | |||
fc966e9cc6 | |||
ef19e67d2c | |||
a41351f363 | |||
6aae216b4e | |||
b22980a1dc | |||
881f735827 | |||
2f54045508 | |||
5aa6efb9a5 | |||
6ca0234478 | |||
649818995f | |||
7a0a9da72b | |||
69bff9bc89 | |||
41ca7eb491 | |||
eef364723c | |||
0d6e187e88 | |||
9420a1fc30 | |||
583e900996 | |||
05e1fbfc52 | |||
fe92176321 | |||
6d0df0ebeb | |||
0fa939e2d1 | |||
0422ce109f | |||
47bdee409c | |||
49f189439d | |||
5adf6f6b7f | |||
4115f19958 | |||
340d7b1b21 | |||
1bcbcbf574 | |||
82e43b2d7e | |||
67309a1cb5 | |||
b724afe343 | |||
21f4f1c9a4 | |||
b0c1f6202d | |||
c0dfd97519 | |||
a9138e85b1 | |||
0a05ed57e6 | |||
14288d1332 | |||
b411418ff0 | |||
2bc0f72ae5 | |||
9c1244de57 | |||
db2f8d915c | |||
6167c0e5d2 | |||
ed2e464653 | |||
2c8ed8ee48 | |||
ed50f46641 | |||
46e678bcff | |||
6b2427f995 | |||
b07d741661 | |||
41fb013d29 | |||
32d4b669d0 | |||
3cde34a4a4 | |||
bdb3660312 | |||
f3a21e9c68 | |||
8e630d680e | |||
af869f6dff | |||
53c0fa1e25 | |||
f7912cba3d | |||
6317a5174a | |||
aa72d9a4ea | |||
ce17db8085 | |||
8c87a9ad46 | |||
ec69124eb4 | |||
d0da99fb70 | |||
b2f195c429 | |||
047797ef90 | |||
eb8ef4224d | |||
56a735261c | |||
e1cf90e099 | |||
6bc1e30ef9 | |||
7e081ba7ca | |||
1e013fa388 | |||
bc7c4d206b | |||
f67e9e9f22 | |||
36fe78769f | |||
83d933718c | |||
5175b884f7 | |||
5536b30a4c | |||
7f58fb9718 | |||
30bc3e0f66 | |||
f34410715f | |||
68d4c33202 | |||
f961d7f6ef | |||
d059110498 | |||
571e8dd65e | |||
4b91c927f6 | |||
0e237f0035 | |||
8f7bace7c3 | |||
e4d6144232 | |||
8d32dc603d | |||
c4ab9f3e71 | |||
2689d5c027 | |||
acba33a0f1 | |||
a114bf20a3 | |||
3097ce3a32 | |||
d6da9322c8 | |||
71ce44047f | |||
188b7f9b8c | |||
b9b4746950 | |||
7b8a2ab76f | |||
c9acbf1141 | |||
5b794cae8d | |||
0e4254492f | |||
1311913f55 | |||
29f395c97c | |||
fa3bba2a53 | |||
986537f1c3 | |||
210207525e | |||
71eda0bb76 | |||
471fe65630 | |||
3a0fba5cf4 | |||
299ebb62b2 | |||
f728ab8e35 | |||
63e26fff78 | |||
fe3462c774 | |||
3b34fd5273 | |||
55d6d3fdb8 | |||
7272bfae77 | |||
d9ac9e3dc5 | |||
d41faaf9df | |||
b34f33438a | |||
26c0406555 | |||
4c41278b77 | |||
bb3605db85 | |||
fe742aef5a | |||
4b07d36891 | |||
87aaadef73 | |||
682e0b6d2f | |||
d6195a748b | |||
205d84aaa9 | |||
5124f5bf51 | |||
83f3c3bd91 | |||
d9737ca1c6 | |||
9d4ca19d50 | |||
2ef0dc53b8 | |||
1d4680fad2 | |||
2c1bd848a6 | |||
5c9121203c | |||
490b1698a5 | |||
5a5e29de88 | |||
3d3ab3689f | |||
686623c5e7 | |||
aadb656562 | |||
87e067de41 | |||
26507f8973 | |||
9c1d5b456d | |||
e31045f95c | |||
aaec845f8e | |||
7bdfd29a35 | |||
e78587a64c | |||
7eb4255628 | |||
6a0f547561 | |||
30ed81b7ca | |||
7a4a5de729 | |||
c16fb5dae8 | |||
e37073efd7 | |||
183dad7a85 | |||
3408e47159 | |||
0377b8310b | |||
e4755f7fac | |||
92edf35826 | |||
eb5819b2d9 | |||
5989f4684d | |||
5125d72f02 | |||
a018e555fd | |||
6211b92273 | |||
05fcd1b430 | |||
7c02d6a137 | |||
11c3b98491 | |||
dbe7f07001 | |||
c69bf4ee06 | |||
d27ea94034 | |||
99ed526101 | |||
207da28186 | |||
5b1aca2ae3 | |||
d8e557b5e5 | |||
61a44a0b22 | |||
a6481525b8 | |||
8cac35ba43 | |||
9dbf7a2dc1 | |||
607029e515 | |||
cb072ce93b | |||
95aca283b4 | |||
2b05b8ce69 | |||
3c776dcefb | |||
2cbd4d2999 | |||
3092375e27 | |||
3cd91dc955 | |||
8a7368e069 | |||
93e561ec4d | |||
e1b004839a | |||
ee378f3d49 | |||
e82ee40de3 | |||
facbe2a114 | |||
7168920491 | |||
21378a2323 | |||
976711d9db | |||
44fa4d556c | |||
3ac98edcb1 | |||
966c742ed2 | |||
0d7d05f4b6 | |||
96bb8aa68b | |||
3badb0213b | |||
fdcb850f14 | |||
54a66e5fee | |||
280d62b8a2 | |||
1666e66443 | |||
1575c1701a | |||
6ae996a873 | |||
b590adfdc1 | |||
b4fe16c75b | |||
bc5dd4f669 | |||
dbb036cf61 | |||
70e7ed841d | |||
d06ba4ed3f | |||
6b40996ae8 | |||
d2020acac7 | |||
1eb3c2ed48 | |||
c64ee87267 | |||
b1308b84a3 | |||
7b5ecf79bd | |||
9883a18859 | |||
b3f2fddd17 | |||
aa29841ede | |||
6bf27affb6 | |||
1dd23386ec | |||
7cbfc10943 | |||
ce4ddd2d1a | |||
e51929ebca | |||
dc1b4a6f13 | |||
63d2705edb | |||
d085a44082 | |||
f49e5aff11 | |||
6c11ecf8d3 | |||
93e5f3c5fb | |||
70363bccfa | |||
3cdc57669f | |||
68bb122eb4 | |||
d9fc8cd9da | |||
f069f3ea74 | |||
c5bc0e7fcc | |||
4a3a518722 | |||
fbf722c6e6 | |||
e92d7085bf | |||
bd6028d6b0 | |||
802329dee9 | |||
41cc883c29 | |||
57504a4bcf | |||
ed4792c990 | |||
87b836ba77 | |||
56c76c2e0e | |||
c09632a66c | |||
a3bf8d4a2b | |||
16eda8c43a | |||
cd77382ac1 | |||
71b9cde010 | |||
5285589f37 | |||
f41647ee6b | |||
4d022cbc75 | |||
70de35a881 | |||
34b2cf3b33 | |||
9e90c9f73f | |||
e9528f6dc6 | |||
51baa9c333 | |||
35e076b3a8 | |||
a26f59ccbc | |||
aa3b3d76e0 | |||
f7030df3be | |||
905e91e9ac | |||
f8f9c0ba62 | |||
dda811021a | |||
93195146ea | |||
ed37599544 | |||
99ef59cf7f | |||
d544d141ec | |||
3e397a9484 | |||
268c325078 | |||
3cc9af88ff | |||
7cd0bd7212 | |||
56d4aefa33 | |||
dd143ef541 | |||
daefed052c | |||
5fbab20e02 | |||
e8224f3dca | |||
9665313c39 | |||
0c54fc7273 | |||
c1b57855ec | |||
83b824c8b4 | |||
7678fcd5b6 | |||
8661c0241d | |||
ce8d6b75fc | |||
61de3ef74b | |||
ec1f9c8c91 | |||
65e09094c4 | |||
c70cf0fe06 | |||
a5d11a54dc | |||
3d4c87758e | |||
a9bd832fc5 | |||
417bcefbae | |||
baada0e737 | |||
82eb61dd4c | |||
0d4d06fe2f | |||
4aed0ca6a2 | |||
1621b25288 | |||
a564797151 | |||
1da6a09274 | |||
1e44ffc3ff | |||
a454748544 | |||
1bff42c4b7 | |||
cb391d85dc | |||
fee5b8d37f | |||
b2ce859bd2 | |||
566f10a929 | |||
c3b5189137 | |||
a25866ac8d | |||
098900d7c2 | |||
98d01d3ce2 | |||
d55244df31 | |||
04149cce27 | |||
24834f4894 | |||
ec7da6fcf3 | |||
819d548e8a | |||
477d2a8aa2 | |||
e484e02857 | |||
24f6b9a713 | |||
9cdde47289 | |||
b1eb4ca152 | |||
87b4ac56c2 | |||
cb84e45ac7 | |||
4716377fbc | |||
4e9cf8c1dd | |||
2976dc27e9 | |||
102bf967f0 | |||
1f4b09b525 | |||
86c3369eb8 | |||
2755c34a8f | |||
db10422184 | |||
e1a2c699dd | |||
0115ccd5c0 | |||
40b4284fe3 | |||
4ebc0b9640 | |||
dc96fd54c6 | |||
1f5d13ab9f | |||
90cb44eb02 | |||
e11880deea | |||
9351f91be9 | |||
5a1e1c8353 | |||
69ecaa7c79 | |||
7f00899ff7 | |||
995e3d1f41 | |||
b4ac449a83 | |||
8e5314a468 | |||
87918e40c4 | |||
f6b32efb7f | |||
b99733d092 | |||
05a015d6a5 | |||
ad971af8c7 | |||
f2ebb6f541 | |||
1d01211264 | |||
f94ab12f79 | |||
a865bc1ca6 | |||
21802c4b6d | |||
652907b354 | |||
24f1c01e0f | |||
fad6e2538e | |||
7f6d47c1a2 | |||
3147586ebd | |||
ed636d99ca | |||
090c856d76 | |||
ad434d4cfe | |||
66d433b94f | |||
027b204ff1 | |||
55dcce91df | |||
8017c8db7f | |||
dc3529dbf6 | |||
7699258ef0 | |||
e9ba99f296 | |||
7c80368710 | |||
95d63f38c0 | |||
bb8dab821e | |||
fc0f87768a | |||
0a57386721 | |||
3749e28774 | |||
86fc2321ff | |||
2549c0dfef | |||
b10e519895 | |||
9bde5ba127 | |||
72c8f1ad04 | |||
da224daaa9 | |||
3a100b9278 | |||
242a637aea | |||
c2a9671510 | |||
d5ae4f7f42 | |||
b6c502a150 | |||
9ca710e525 | |||
eb07c8cb5b | |||
ba10801961 | |||
620fc2d09e | |||
29283eaa7e | |||
2fa66ef713 | |||
13affc432d | |||
d8f094a92a | |||
97ae6d777f | |||
6baeee70d1 | |||
d2517a4939 | |||
6342adc438 | |||
0adba91547 | |||
4285e423a6 | |||
63375f0cdb | |||
70ad3f9e98 | |||
d6fc629f4d | |||
af51d80fa1 | |||
f5722a5052 | |||
651cf0fec1 | |||
4dc52e1c53 | |||
4708f13a9c | |||
a6d042df0a | |||
40a36ccfeb | |||
ef608c37a7 | |||
2386803f2a | |||
95862f7b4d | |||
230b131b54 | |||
0812d8dd41 | |||
bf7e3c51ae | |||
a35a8a8392 | |||
4ef0bb1fcf | |||
fadc59c0e6 | |||
86cbd2eee9 | |||
092475f738 | |||
dcc56d62da | |||
f15e70d906 | |||
b6be6f8d1e | |||
03a70eacaf | |||
45b1ff7a25 | |||
15ba07ef25 | |||
d2b58ca203 | |||
82e7e19a6e | |||
421c462948 | |||
84884cd9ac | |||
a43aa183dc | |||
463bbb1835 | |||
5e125e74d1 | |||
06f21ce7a5 | |||
57a810db9c | |||
8b664706aa | |||
37bfee92bf | |||
e73ff24e31 | |||
bd7599d34a | |||
01b6113659 | |||
1b84eff03a | |||
55acf86bf8 | |||
f021b97993 | |||
1cab43c2d2 | |||
8bd651b318 | |||
58e234a754 | |||
e86c414d6a | |||
550b2801ad | |||
cefb9e5a28 | |||
98d7367b61 | |||
594a8b9030 | |||
44f990515b | |||
252937806c | |||
51826d51fa | |||
14e53ed11f | |||
ddb94c2605 | |||
90969fb39a | |||
101f1481f9 | |||
2edc87b161 | |||
4203926f10 | |||
cdb57015a7 | |||
aa557e6422 | |||
0e00d40e4f | |||
c920e01242 | |||
274d8e8818 | |||
2039c6305b | |||
6efb195a6e | |||
24b7fb455a | |||
58f5a59769 | |||
db9dfcfa6a | |||
9ef98d527e | |||
93491aefc7 | |||
7acd539cd7 | |||
e75a6301bd | |||
a79cc68b3a | |||
7e3f7a4ee7 | |||
9ec8257914 | |||
38327cf454 | |||
dfa82e2a3d | |||
e59ca942f5 | |||
a57a3044aa | |||
4e5a0f6ae2 | |||
b63bd14999 | |||
2041c0e360 | |||
085cbc4f9f | |||
2b93162fb0 | |||
2e45bd29fe | |||
51d7c6a2b2 | |||
f3aca1ee30 | |||
8dd41d6bcc | |||
0a298ea418 | |||
d330558bab | |||
656fd72976 | |||
79455cf421 | |||
30d6a015e0 | |||
8af5a5c4e5 | |||
3a5f0afcd2 | |||
c7e63aa4d8 | |||
4a9ce1784c | |||
7e4e709b43 | |||
63d8eabed0 | |||
e830b01383 | |||
ff6473980d | |||
a164aea35d | |||
a76f547e11 | |||
b7b7676d67 | |||
e6e3c55ef2 | |||
f98a4920f9 | |||
d4bfc23ef0 | |||
9a2160fa55 | |||
2de4118243 | |||
239b7befdd | |||
09e974d483 | |||
e5ef4fa99a | |||
037bcd942c | |||
c2e7507ad4 | |||
3aa2b6a637 | |||
555aa21905 | |||
e7ae3bf3d6 | |||
b932c048ac | |||
e85829450d | |||
effc5d24fa | |||
18ed3132d2 | |||
9b459eca88 | |||
70fedd0f79 | |||
bb103b29bf | |||
248e76c4df | |||
803d5c35f3 | |||
7fd8c0f85c | |||
44c3a5abc3 | |||
6909a76201 | |||
045533716b | |||
3c0ff914ac | |||
2bc4be4e32 | |||
c67abd614f | |||
6fa7cd3dbc | |||
94744ba41a | |||
4965ec42d2 | |||
73aa7041bf | |||
7c1f760024 | |||
da461f3cbf | |||
5b800f0932 | |||
8427f70493 | |||
7a7992085b | |||
1286211f57 | |||
6d531ad7b8 | |||
762b424a52 | |||
de1cb38769 | |||
c802f5430d | |||
cff8991a50 | |||
f3f8d8fff4 | |||
26df46ee59 | |||
c3f687ac22 | |||
04437e313d | |||
038bededba | |||
d03308be0c | |||
c6bc0034d0 | |||
70e132244a | |||
47e9038d23 | |||
432cf22a6a | |||
2914006fe0 | |||
7329ff5468 | |||
541d1df486 | |||
3b00ff9138 | |||
91276c5721 | |||
0b4167526d | |||
fd5fd26902 | |||
3bbaacbe15 | |||
a10314c6b3 | |||
70f2c2a709 | |||
280d074103 | |||
32b14baf8a | |||
2d9045fce8 | |||
355f66348c | |||
8693e47e6a | |||
cec8c7d7f8 | |||
4d0ec37267 | |||
e7f720ea56 | |||
4ae17bf1e2 | |||
8a49eea74b | |||
b4245a48df | |||
4e0f6076be | |||
726efc6a32 | |||
bd45912b99 | |||
15dac210f0 | |||
112b3e5b3b | |||
32d669275b | |||
4098b72210 | |||
46450b8d33 | |||
13ac9cab21 | |||
66aa4c0bf4 | |||
247181536f | |||
07bf813fb5 | |||
8958217ad5 | |||
ac5bc615b0 | |||
8063dfc61a | |||
6278bc829e | |||
3f532cb6a6 | |||
e6c9053f9e | |||
43ed4143c4 | |||
f4c98b4d4c | |||
e1e0fd7543 | |||
df8d3d1287 | |||
619d3de8bd | |||
ecff8309a3 | |||
dcf2a590f5 | |||
54aa619459 | |||
fb22be5817 | |||
7f301dd8ef | |||
8095341a01 | |||
69db16a46a | |||
ce78f9af4e | |||
9239bf718e | |||
7a6d45bc8a | |||
e74ff409e0 | |||
7a888271f5 | |||
9d119a86ae | |||
b2e85e26f4 | |||
dd8a29da99 | |||
27df5199d9 | |||
35fad35a48 | |||
733e7c9e95 | |||
0af4d764d6 | |||
e64afa455c | |||
1711b929b6 | |||
c091c0a588 | |||
1aa162e030 | |||
cf5c8f1686 | |||
4ec2cee000 | |||
99f536f830 | |||
5ebf66748b | |||
781d056280 | |||
5aefd6ac31 | |||
6c663dfd5e | |||
33437bc6e7 | |||
23114d3364 | |||
997c8811d6 | |||
e42389f9d7 | |||
ff38f0a32c | |||
a5cfbab3c8 | |||
ac3cd6e83c | |||
082ab86f5f | |||
6aa196c8dc | |||
a0dd7dcd49 | |||
e977c11111 | |||
5f063a80bd | |||
5d8e1c9279 | |||
0a049c7d86 | |||
d0cfec7ab9 | |||
a608160027 | |||
3f04a7fbf2 | |||
5994430b84 | |||
a9e879b316 | |||
3e2f37a69a | |||
4f044b1d67 | |||
4157f563b4 | |||
051da7efe3 | |||
25f560a62c | |||
a09ad90a72 | |||
10b34e36b9 | |||
b5269db959 | |||
6db94571d7 | |||
97cfa65df7 | |||
911c8eb000 | |||
ebcebeeb6b | |||
f533b5837f | |||
8279201ce6 | |||
23fdab00a8 | |||
623e2ed29f | |||
9d72daf4ce | |||
6dd55af6c9 | |||
3eb08ed9b1 | |||
5eeadc2642 | |||
3aee6573dc | |||
9cc645141d | |||
0893567db9 | |||
8abe69b499 | |||
761702fd19 | |||
9606d572ed | |||
cbcdf2c609 | |||
038de04d7b | |||
6b3cc75be0 | |||
7ffcccfa5c | |||
cc8accfd53 | |||
948ab03e7e | |||
5797fb97e9 | |||
3892e58ad7 | |||
d20e261199 | |||
f622dbcf39 | |||
dccf535f8e | |||
9c5c81b0da | |||
d6cd59f122 | |||
bc8ed3c4ba | |||
b9bd76ca14 | |||
6ebaf9ac71 | |||
f90d34b498 | |||
f68cce8e64 | |||
09b6a95551 | |||
50c9636d87 | |||
0661cfef7a | |||
a827aa815d | |||
b877031d80 | |||
dd861b992f | |||
eb63ea1e18 | |||
2f4bd358f1 | |||
8a8b30eac1 | |||
2fa0e1396b | |||
1c2bec0f82 | |||
ec870fba9a | |||
df1430265c | |||
4c69e228b3 | |||
790b79750b | |||
cfbb8c930f | |||
baec0d4de9 | |||
c21b99b912 | |||
93a00d7dde | |||
61e8c18350 | |||
8afcd0f633 | |||
91ca929dc7 | |||
84e00adc8a | |||
47c7126213 | |||
a989ca2bf6 | |||
0fa3970deb | |||
da6ea29f7a | |||
7297941b38 | |||
f8a08cb90d | |||
b15fd2be2a | |||
e588ac237c | |||
5df2da5b97 | |||
11b986b3fb | |||
296f927f24 | |||
0032903a5b | |||
47195057e9 | |||
6edbfa924d | |||
1e508343e1 | |||
2e0b4cfde0 | |||
10f55fe6c5 | |||
d3ccbd6350 | |||
0cfe7d386d | |||
0c6f5023c3 | |||
06dd08256f | |||
2b22290ce0 | |||
d8e82bc06d | |||
086b56824c | |||
5a0905ba2a | |||
a8f12a63fd | |||
69ae2380c6 | |||
27261e40a6 | |||
e3f813c33b | |||
c607a2652b | |||
3d45e3d749 | |||
742369d35a | |||
bfe2fe0af4 | |||
a8652f4f0f | |||
2f726b241e | |||
a597a57595 | |||
ae65f3e237 | |||
34868b106a | |||
1f16b7fe74 | |||
b88be22165 | |||
d8c6d7d6b5 | |||
40828ce5fe | |||
ffa443afed | |||
70e500cad9 | |||
4cb1c05c9e | |||
c47aafa37c | |||
cfbca8a2f2 | |||
0fe5609874 | |||
22d33baca2 | |||
b0e96aaebb | |||
8310e0b59b | |||
26dd972adb | |||
61c7a1b856 | |||
374ee287d8 | |||
a4d83661d7 | |||
8363cd093d | |||
6c5a3195db | |||
073d1ed354 | |||
3d446433ec | |||
1fe0fd12d3 | |||
dafb4e504a | |||
68cf1601d3 | |||
61f412187d | |||
05ccd0aa35 | |||
f690372b68 | |||
8b3e94a357 | |||
437f9162d0 | |||
4f065f12f5 | |||
228b768db6 | |||
027827cc1d | |||
72a8639b68 | |||
99abb8b650 | |||
3a1e648158 | |||
46c759c165 | |||
179a619c21 | |||
452e8fd968 | |||
8b793f7ec6 | |||
af35d3a3cc | |||
3b457143d2 | |||
ab656f2c2f | |||
64fc2193dc | |||
dd732028f5 | |||
414919138b | |||
db7c8ca910 | |||
f863ffc965 | |||
400d483e87 | |||
d1695758b2 | |||
53a0cf8b95 | |||
5eeabc2a44 | |||
18551e820c | |||
e41e160263 | |||
b89fb2a4a1 | |||
5340b0e221 | |||
37e3806132 | |||
c0efdd655b | |||
aaaec52ad9 | |||
e1eb45d397 | |||
89fca671fb | |||
d20b0c139c | |||
166a168b0f | |||
2bb0e1a799 | |||
6eaf1e5c52 | |||
868a8c5b2c | |||
b4ad56c1bd | |||
69698f257e | |||
cd0cd85102 | |||
0a74bfce9c | |||
dd3b865854 | |||
9b87a579aa | |||
b539222d4e | |||
8d6cf89526 | |||
583a9778e0 | |||
a73e183e36 | |||
1e799b7ec1 | |||
7f6c5ee06c | |||
faa0275730 | |||
8a5a9b70d7 | |||
bb3aeddfaf | |||
aecc780dba | |||
90df7f23aa | |||
b9b5bdfc7d | |||
31060b2757 | |||
fc1f67715d | |||
f6137adbcb | |||
e53b1350f2 | |||
d30aa7e9e6 | |||
d1ad2a57af | |||
b82662d952 | |||
71c1e07107 | |||
b30c75dda4 | |||
def232e122 | |||
3453b964a3 | |||
61c6a5a796 | |||
74bc397b0a | |||
f58aea002c | |||
3556a41434 | |||
9ed6ee92d6 | |||
ee3778d5fc | |||
aaacf17324 | |||
4c7629cae9 | |||
e0fdfa1608 | |||
5952d8ab61 | |||
a2ae496589 | |||
877e352262 | |||
d4d93db2c5 | |||
8c0d15d5c5 | |||
97ac781c62 | |||
776dcec8fe | |||
ccf02fcbae | |||
acaea3bb07 | |||
9f37422779 | |||
dd344e0342 | |||
54a8804455 | |||
bbd94a19fc | |||
233ffce1eb | |||
40677783aa | |||
14f301b541 | |||
46f98893dd | |||
fe66b34728 | |||
270a5da495 | |||
7097b4cc1c | |||
977a16772c | |||
73deea2fdb | |||
9d2b4a70f4 | |||
0b0d6421b2 | |||
1140991a7b | |||
613c5bb945 | |||
fd8e055ffb | |||
ab93f1360f | |||
40253bab44 | |||
c77620d22d | |||
989ecd2007 | |||
54cc46f3eb | |||
601bd3268e | |||
09269b3127 | |||
27b50f1fe6 | |||
9532c49836 | |||
0c2af17c76 | |||
a6e0d096dd | |||
d3d4956261 | |||
4059adc31b | |||
f1f632d9ec | |||
95d680b862 | |||
fb4c7f8ef0 | |||
0b1cfa6180 | |||
32ef4983cd | |||
ad19c8a003 | |||
2a602b055a | |||
7888e1d0a3 | |||
60c872d4b6 | |||
3fb17d26c8 | |||
d47807ba08 | |||
02fcaa3d0a | |||
8a4a2efc6f | |||
8e9ffd37d6 | |||
01b3fd0af7 | |||
f53a0586b9 | |||
b1cc4dfef5 | |||
382403921f | |||
a73122de96 | |||
bd44b812cb | |||
55211b01e8 | |||
5d043c1685 | |||
36d1ccb286 | |||
1bc3b739c4 | |||
1bd32bc8dd | |||
128bf75283 | |||
a94a699c3f | |||
ab426ec9c0 | |||
165290d357 | |||
ce20124671 | |||
53be4a8634 | |||
f5d3acd474 | |||
916836bbfb | |||
d9f83d6206 | |||
4a754fcf15 | |||
c0c25e25fa | |||
45f3f3f59e | |||
ff47aab056 | |||
debd6bbf09 | |||
5c538c37b2 | |||
e22ee1e7a2 | |||
e392d85831 | |||
77a318bd01 | |||
80e78d02ac | |||
4a42b9f5d6 | |||
47532cd9f4 | |||
36e0c8f7da | |||
9f583e360c | |||
b706d898af | |||
863d315c86 | |||
d374f04a33 | |||
61a01b27a7 | |||
53056731fd | |||
4cbf286794 | |||
c6e14a61ab | |||
07b4b7a37f | |||
07964e2f30 | |||
4bf82d4b90 | |||
9ab326713f | |||
af295e9b01 | |||
a1c8f3796c | |||
08a1a1121d | |||
1477ffc381 | |||
70b808fe1a | |||
63d635d179 | |||
1fc973c0b5 | |||
c982ac5722 | |||
4290b704ff | |||
c91b64f749 | |||
d6123170d5 | |||
485afdd3cb | |||
90e88ab756 | |||
04421dff8a | |||
432d6dad15 | |||
5ff0d32580 | |||
0967110e42 | |||
fb0acb6c72 | |||
92b0ce2ac7 | |||
bc2d4473bf | |||
3b352a2f92 | |||
dea985aef0 | |||
39be30351f | |||
001a9c7b0d | |||
89cdaa83e7 | |||
b0746fae3d | |||
60a98b2de5 | |||
460f553a6d | |||
1253b15774 | |||
dc74613fa2 | |||
a21076ed3a | |||
212007b168 | |||
fb16eea48b | |||
73ae0b44e9 | |||
6d7f037748 | |||
10f7552789 | |||
b0d541947a | |||
5f0b53c6ea | |||
eb8b5eb183 | |||
9513290032 | |||
0d5e73d30e | |||
609ef61fea | |||
db84f5eb3b | |||
206e2577fa | |||
e02883c400 | |||
9085aabd62 | |||
8d5aa466fb | |||
0b7f06b447 | |||
03fe18ae0f | |||
cb8bdfade2 | |||
33f227e16b | |||
cfd0ae8234 | |||
7caff01a7b | |||
be0b399d74 | |||
b8b0ccbd2d | |||
c908a07f57 | |||
7b6fd6e486 | |||
47512b3200 | |||
3b9c6c6947 | |||
4aae667668 | |||
9f3bc0f58c | |||
980385f8c1 | |||
ca7a2d5f28 | |||
333681408f | |||
ef64044079 | |||
66e16a038e | |||
e1f0835ae0 | |||
8ed5421aaa | |||
c6359e8ca6 | |||
952a074980 | |||
d0feea31c7 | |||
58abe35455 | |||
f7ebad2307 | |||
80e9afb5bc | |||
1e3598edeb | |||
f7a6bd0fa1 | |||
0ca3b8e01c | |||
cc10281498 | |||
05fb6718f0 | |||
12c29a881f | |||
70da0c0748 | |||
c1588a2c94 | |||
8ca7a71df7 | |||
63137cd922 | |||
ddd1ef66ec | |||
e5e03c2c1b | |||
e1744502c2 | |||
dae6896977 | |||
c34eeec58d | |||
ad60bbb2b2 | |||
0578e5a462 | |||
04222984f8 | |||
6832707e90 | |||
6b2ef5cd17 | |||
958adce478 | |||
99b0915d3b | |||
8ca2b21c98 | |||
d9292786e1 | |||
cc2f9b32c8 | |||
cd579352bf | |||
9f1710f1ac | |||
e642ec962c | |||
ada19210a3 | |||
bf0560bda9 | |||
151b08e0fe | |||
81b2f4a45f | |||
82551ad616 | |||
caac5c2e59 | |||
6bd1dd9d26 | |||
4f27044aab | |||
0ddc991f5c | |||
fa82b93853 | |||
69ff99fdcd | |||
5d802522a7 | |||
1769928079 | |||
ed6ea06577 | |||
5ee10e990d | |||
3dbd2d813a | |||
f5f7f00cd9 | |||
abcc61e0af | |||
f6bb18fd9a | |||
71eaf8969b | |||
ca100c90fe | |||
ffad94397d | |||
4dacaa4a83 | |||
a7ea35aa67 | |||
1e3e76b6cc | |||
53ea6ad830 | |||
1b7624bf5c | |||
ac60dc7fe1 | |||
a4f1ee35d6 | |||
a32c8669ca | |||
ca2ca8de57 | |||
f71b00a19e | |||
8f808cf86e | |||
7bab4bb048 | |||
e17e4488bd | |||
257e200a25 | |||
47d4a7e004 | |||
7f89a594dd | |||
961644e6a8 | |||
8d6cd32b7b | |||
ec79b67c77 | |||
32985bed7c | |||
dae9ec464c | |||
6eaf93020d | |||
72c62eae5f | |||
0a995d5434 | |||
ade3f7d988 | |||
0df25101d6 | |||
e123aafdf0 | |||
5b143d33be | |||
eb59b5a6cb | |||
fbfc3ee37e | |||
3e1d223626 | |||
4f5b059f14 | |||
288ca110f6 | |||
c2bd2196fc | |||
550c7ba3dc | |||
e5b2f1601a | |||
9badee53de | |||
beebf4742a | |||
f89978ad7c | |||
b3cf368d79 | |||
c8525f06fc | |||
5db6b2c961 | |||
6247bae6c6 | |||
3610fb4930 | |||
71c4b40562 | |||
ac65bc92df | |||
f78c0be80a | |||
66233af7b6 | |||
bf13d40972 | |||
989f4f430c | |||
bb5b640359 | |||
c060b71408 | |||
79e4937c65 | |||
cd1d3c3df8 | |||
19d98e0c7d | |||
2b04c209ee | |||
ae122b1cbd | |||
872db2be0e | |||
2dfdfed8a0 | |||
c41d27156b | |||
91373a0d15 | |||
848a6438ae | |||
98175b2816 | |||
4167252eaf | |||
f35f8e2242 | |||
b87c21fc89 | |||
e584b85afd | |||
09e56f9262 | |||
cf069aa8aa | |||
bf33700ecd | |||
bc6ccb9878 | |||
82fbeae92b | |||
cc5e8f6db8 | |||
d54990da47 | |||
b9f1d4294e | |||
b28246f6ff | |||
3b5567a209 | |||
fdcc405346 | |||
8994dabc22 | |||
02296f420d | |||
6a92ff93e1 | |||
6a84164add | |||
f64ffa8c25 | |||
bd56c983d6 | |||
084bbac8cc | |||
28943d36ce | |||
b526ca6726 | |||
e7bd944e08 | |||
c3b6559a10 | |||
4be4b26cb7 | |||
2aed2c9fa7 | |||
9b61dd41e7 | |||
f7bee5c815 | |||
e0734387fb | |||
f58f8b5c96 | |||
b3f7aaccd0 | |||
b91660ddb8 | |||
76c89fcadd | |||
b9e41734c5 | |||
1088f06242 | |||
73e0225ee9 | |||
6c85da3a18 | |||
67fc426845 | |||
9804145cac | |||
2e94b9cfbb | |||
8294773e48 | |||
cd813c6d4d | |||
38acae6e97 | |||
a2dd48c386 | |||
126f6beeb4 | |||
58d1b2aa77 | |||
f1579b229d | |||
7864875879 | |||
1dd422b64a | |||
06c8f8d885 | |||
5677c9bb3e | |||
512d77d582 | |||
7f0be2aa24 | |||
edf309ebbe | |||
788f284b53 | |||
4b1d141f49 | |||
10c3b8c1cf | |||
a7f37314b7 | |||
cd711c48b2 | |||
378b3ef6f8 | |||
c9944acbf9 | |||
ca377cf1b9 | |||
a31614e386 | |||
f95903909f | |||
b382a7f28f | |||
4cb6fa0a9c | |||
d08b285adf | |||
b27122acc2 | |||
934bb99c71 | |||
3f808cc044 | |||
ec8a5e5386 | |||
215bf150a6 | |||
0ecdd98031 | |||
7b700ec8c8 | |||
7ca1da020f | |||
5157338ed9 | |||
e206b54331 | |||
1d35662e6d | |||
e656f638de | |||
145944cb94 | |||
094b7d9496 | |||
e1fe7591f2 | |||
5629f26df7 | |||
9ba28043b5 | |||
24679788ed | |||
07c4353057 | |||
34e3494e70 | |||
f75aa72732 | |||
340e39e387 | |||
f4133ce4e5 | |||
6522d55b6f | |||
6ff518626c | |||
fa82074167 | |||
75e9d49796 | |||
32c3b6bfd1 | |||
37b6cb4985 | |||
aabeb2688f | |||
2f42a4888c | |||
3173c3b34e | |||
2d87d7d1ac | |||
aab392774b | |||
6724e79164 | |||
03f48b3db6 | |||
4d251ad00e | |||
18e505930d | |||
4a8cfc7551 | |||
bc32bc73aa | |||
ab1091d5f2 | |||
1e15aaef56 | |||
51010a1807 | |||
7196a3b1db | |||
cdc1fa12eb | |||
f61528d46d | |||
1f0ae3ed0a | |||
db986c19ea | |||
227578480d | |||
befc402d34 | |||
444b0f0f62 | |||
ccc00515fd | |||
781096e385 | |||
7940d8a6a7 | |||
c0e3ecd6d2 | |||
23eca9cf68 | |||
437b76ff59 | |||
f90a375593 | |||
e7ef74e26e | |||
cbae7af552 | |||
eb24dc4a45 | |||
9bebc9512f | |||
5a2ba16f5c | |||
ba5106e519 | |||
d5ca2110f1 | |||
2c5e637b57 | |||
322d2a27d6 | |||
82e0d601fc | |||
78ac0f591d | |||
b56155e7f3 | |||
382f66fb08 | |||
8354f6640c | |||
c904fdddf6 | |||
558db8083c | |||
e109e598c7 | |||
8db1b9d0a1 | |||
2382ad29d1 | |||
3e472d882a | |||
7f6bae561c | |||
105b8ce4c0 | |||
2cb8c1540e | |||
1cd981da4f | |||
fca20841c2 | |||
da31b5333e | |||
bb78fb318e | |||
8aca27fa11 | |||
95c617e04b | |||
9a1f1da5d1 | |||
68d630a0c7 | |||
68d535ef44 | |||
c6ed93860f | |||
0ffdf8ce0c | |||
8c0dd3d4df | |||
ada7c780d5 | |||
288cc6c234 | |||
900edbfa48 | |||
b2c3fc5d65 | |||
839b27c6cc | |||
34ad27fe83 | |||
1c3c975766 | |||
1cdc88614a | |||
31aa045c11 | |||
a30c093502 | |||
c7b07a95a6 | |||
27a09dc52c | |||
981f3c831e | |||
44c33f01f3 | |||
33170081f1 | |||
71face8540 | |||
bfbc0b32c6 | |||
6a417b8600 | |||
d3ea50113c | |||
34aad515c8 |
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import sys
|
||||
@ -8,12 +9,12 @@ import zipfile
|
||||
# Note that we have 400 MiB quota, please use it wisely.
|
||||
# See https://github.com/pypi/support/issues/3792 .
|
||||
# Please also sync the value with the one in Dockerfile.
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
|
||||
|
||||
|
||||
def print_top_10_largest_files(zip_file):
|
||||
"""Print the top 10 largest files in the given zip file."""
|
||||
with zipfile.ZipFile(zip_file, 'r') as z:
|
||||
with zipfile.ZipFile(zip_file, "r") as z:
|
||||
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
|
||||
file_sizes.sort(key=lambda x: x[1], reverse=True)
|
||||
for f, size in file_sizes[:10]:
|
||||
@ -28,14 +29,18 @@ def check_wheel_size(directory):
|
||||
wheel_path = os.path.join(root, file_name)
|
||||
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
|
||||
if wheel_size_mb > VLLM_MAX_SIZE_MB:
|
||||
print(f"Not allowed: Wheel {wheel_path} is larger "
|
||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||
f"({VLLM_MAX_SIZE_MB} MB).")
|
||||
print(
|
||||
f"Not allowed: Wheel {wheel_path} is larger "
|
||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||
f"({VLLM_MAX_SIZE_MB} MB)."
|
||||
)
|
||||
print_top_10_largest_files(wheel_path)
|
||||
return 1
|
||||
else:
|
||||
print(f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb:.2f} MB).")
|
||||
print(
|
||||
f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb:.2f} MB)."
|
||||
)
|
||||
return 0
|
||||
|
||||
|
||||
@ -45,4 +50,4 @@ if __name__ == "__main__":
|
||||
sys.exit(1)
|
||||
|
||||
directory = sys.argv[1]
|
||||
sys.exit(check_wheel_size(directory))
|
||||
sys.exit(check_wheel_size(directory))
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
@ -22,5 +23,5 @@ with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(wheel=filename,
|
||||
wheel_html_escaped=filename.replace("+", "%2B")))
|
||||
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
|
||||
)
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
||||
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
||||
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
||||
tasks:
|
||||
|
@ -1,4 +1,5 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
||||
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||
tasks:
|
||||
|
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.335
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.323
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
|
@ -1,11 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "mgoin/Minitron-4B-Base-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.233
|
||||
value: 0.231
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.236
|
||||
value: 0.22
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||
tasks:
|
||||
|
@ -1,4 +1,5 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
||||
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.30
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.465
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
||||
tasks:
|
||||
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||
tasks:
|
||||
|
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.54
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.59
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.47
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.64
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
@ -1,3 +1,4 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
||||
tasks:
|
||||
|
@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
|
||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||
Qwen2-57B-A14-Instruct.yaml
|
||||
DeepSeek-V2-Lite-Chat.yaml
|
||||
Meta-Llama-3-8B-QQQ.yaml
|
||||
|
@ -1,10 +1,6 @@
|
||||
Meta-Llama-3-8B-Instruct.yaml
|
||||
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||
Minitron-4B-Base-FP8.yaml
|
||||
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
|
||||
Qwen2-1.5B-Instruct-FP8W8.yaml
|
||||
Meta-Llama-3-8B-QQQ.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
||||
|
44
.buildkite/lm-eval-harness/conftest.py
Normal file
44
.buildkite/lm-eval-harness/conftest.py
Normal file
@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--config-list-file",
|
||||
action="store",
|
||||
help="Path to the file listing model config YAMLs (one per line)",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tp-size",
|
||||
action="store",
|
||||
default="1",
|
||||
help="Tensor parallel size to use for evaluation",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def config_list_file(pytestconfig, config_dir):
|
||||
rel_path = pytestconfig.getoption("--config-list-file")
|
||||
return config_dir / rel_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tp_size(pytestconfig):
|
||||
return pytestconfig.getoption("--tp-size")
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc):
|
||||
if "config_filename" in metafunc.fixturenames:
|
||||
rel_path = metafunc.config.getoption("--config-list-file")
|
||||
config_list_file = Path(rel_path).resolve()
|
||||
config_dir = config_list_file.parent
|
||||
with open(config_list_file, encoding="utf-8") as f:
|
||||
configs = [
|
||||
config_dir / line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
metafunc.parametrize("config_filename", configs)
|
@ -1,59 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on GSM8k using vllm and compares to "
|
||||
echo "precomputed baseline (measured by HF transformers.)"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
|
||||
echo " -t - tensor parallel size"
|
||||
echo
|
||||
}
|
||||
|
||||
SUCCESS=0
|
||||
|
||||
while getopts "c:t:" OPT; do
|
||||
case ${OPT} in
|
||||
c )
|
||||
CONFIG="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Parse list of configs.
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||
|
||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||
do
|
||||
LOCAL_SUCCESS=0
|
||||
|
||||
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
|
||||
|
||||
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
|
||||
export LM_EVAL_TP_SIZE=$TP_SIZE
|
||||
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
|
||||
|
||||
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
|
||||
else
|
||||
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
|
||||
fi
|
||||
|
||||
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
|
||||
|
||||
done
|
||||
|
||||
if [ "${SUCCESS}" -eq "0" ]; then
|
||||
exit 0
|
||||
else
|
||||
exit 1
|
||||
fi
|
@ -1,64 +1,55 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
LM eval harness on model to compare vs HF baseline computed offline.
|
||||
Configs are found in configs/$MODEL.yaml
|
||||
|
||||
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||
* export LM_EVAL_TP_SIZE=4
|
||||
* pytest -s test_lm_eval_correctness.py
|
||||
pytest -s -v test_lm_eval_correctness.py \
|
||||
--config-list-file=configs/models-small.txt \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import lm_eval
|
||||
import numpy
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
RTOL = 0.05
|
||||
TEST_DATA_FILE = os.environ.get(
|
||||
"LM_EVAL_TEST_DATA_FILE",
|
||||
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
||||
|
||||
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
||||
RTOL = 0.08
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config):
|
||||
trust_remote_code = eval_config.get('trust_remote_code', False)
|
||||
|
||||
model_args = f"pretrained={eval_config['model_name']}," \
|
||||
f"tensor_parallel_size={TP_SIZE}," \
|
||||
f"add_bos_token=true," \
|
||||
f"trust_remote_code={trust_remote_code}"
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code}"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
batch_size="auto")
|
||||
|
||||
batch_size="auto",
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def test_lm_eval_correctness():
|
||||
eval_config = yaml.safe_load(
|
||||
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||
|
||||
# Launch eval requests.
|
||||
results = launch_lm_eval(eval_config)
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
# Confirm scores match ground truth.
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
ground_truth = metric["value"]
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(f'{task["name"]} | {metric["name"]}: '
|
||||
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||
success = success and numpy.isclose(
|
||||
ground_truth, measured_value, rtol=RTOL)
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
# Assert at the end, print all scores even on failure for debugging.
|
||||
assert success
|
||||
|
@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||
If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
@ -65,18 +66,18 @@ def read_markdown(file):
|
||||
|
||||
|
||||
def results_to_json(latency, throughput, serving):
|
||||
return json.dumps({
|
||||
'latency': latency.to_dict(),
|
||||
'throughput': throughput.to_dict(),
|
||||
'serving': serving.to_dict()
|
||||
})
|
||||
return json.dumps(
|
||||
{
|
||||
"latency": latency.to_dict(),
|
||||
"throughput": throughput.to_dict(),
|
||||
"serving": serving.to_dict(),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
@ -84,8 +85,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
@ -99,8 +105,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
@ -110,7 +121,8 @@ if __name__ == "__main__":
|
||||
for perc in [10, 25, 50, 75, 90, 99]:
|
||||
# Multiply 1000 to convert the time unit from s to ms
|
||||
raw_result.update(
|
||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
|
||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
||||
)
|
||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||
|
||||
# add the result to raw_result
|
||||
@ -121,8 +133,13 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
@ -138,26 +155,27 @@ if __name__ == "__main__":
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||
|
||||
raw_results_json = results_to_json(latency_results, throughput_results,
|
||||
serving_results)
|
||||
raw_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
|
||||
# remapping the key, for visualization purpose
|
||||
if not latency_results.empty:
|
||||
latency_results = latency_results[list(
|
||||
latency_column_mapping.keys())].rename(
|
||||
columns=latency_column_mapping)
|
||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
||||
columns=latency_column_mapping
|
||||
)
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(
|
||||
serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping)
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
if not throughput_results.empty:
|
||||
throughput_results = throughput_results[list(
|
||||
throughput_results_column_mapping.keys())].rename(
|
||||
columns=throughput_results_column_mapping)
|
||||
throughput_results = throughput_results[
|
||||
list(throughput_results_column_mapping.keys())
|
||||
].rename(columns=throughput_results_column_mapping)
|
||||
|
||||
processed_results_json = results_to_json(latency_results,
|
||||
throughput_results,
|
||||
serving_results)
|
||||
processed_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
|
||||
for df in [latency_results, serving_results, throughput_results]:
|
||||
if df.empty:
|
||||
@ -169,38 +187,39 @@ if __name__ == "__main__":
|
||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||
# we want to turn it into "8xGPUTYPE"
|
||||
df["GPU"] = df["GPU"].apply(
|
||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
|
||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
||||
)
|
||||
|
||||
# get markdown tables
|
||||
latency_md_table = tabulate(latency_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
serving_md_table = tabulate(serving_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
throughput_md_table = tabulate(throughput_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
latency_md_table = tabulate(
|
||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
serving_md_table = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
throughput_md_table = tabulate(
|
||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
|
||||
# document the result
|
||||
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||
|
||||
results = read_markdown("../.buildkite/nightly-benchmarks/" +
|
||||
"performance-benchmarks-descriptions.md")
|
||||
results = read_markdown(
|
||||
"../.buildkite/nightly-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
)
|
||||
results = results.format(
|
||||
latency_tests_markdown_table=latency_md_table,
|
||||
throughput_tests_markdown_table=throughput_md_table,
|
||||
serving_tests_markdown_table=serving_md_table,
|
||||
benchmarking_results_in_json_string=processed_results_json)
|
||||
benchmarking_results_in_json_string=processed_results_json,
|
||||
)
|
||||
f.write(results)
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||
|
||||
results = latency_results.to_dict(
|
||||
orient='records') + throughput_results.to_dict(
|
||||
orient='records') + serving_results.to_dict(orient='records')
|
||||
results = (
|
||||
latency_results.to_dict(orient="records")
|
||||
+ throughput_results.to_dict(orient="records")
|
||||
+ serving_results.to_dict(orient="records")
|
||||
)
|
||||
f.write(json.dumps(results))
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
@ -14,15 +15,12 @@ def main(model, cachedir):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download and save Hugging Face tokenizer")
|
||||
parser.add_argument("--model",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Name of the model")
|
||||
parser.add_argument("--cachedir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Directory to save the tokenizer")
|
||||
description="Download and save Hugging Face tokenizer"
|
||||
)
|
||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
||||
parser.add_argument(
|
||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.model, args.cachedir)
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
@ -11,33 +12,33 @@ from tabulate import tabulate
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
description="Parse command line arguments for summary-nightly-results script."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--results-folder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The folder where the results are stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--description", type=str, required=True, help="Description of the results."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||
target = df["Test name"].str.contains(model)
|
||||
target = target & df["Engine"].str.contains(method)
|
||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
means.append(0.0)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
@ -45,7 +46,6 @@ def get_perf(df, method, model, metric):
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
@ -60,7 +60,8 @@ def get_perf_w_std(df, method, model, metric):
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)")
|
||||
df, method, model, "Output Tput (tok/s)"
|
||||
)
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
@ -80,18 +81,17 @@ def main(args):
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from lmdeploy.serve.openai.api_client import APIClient
|
||||
|
||||
|
@ -426,7 +426,7 @@ main() {
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
pip install -r requirements-dev.txt
|
||||
pip install -r requirements/dev.txt
|
||||
which genai-perf
|
||||
|
||||
# check storage
|
||||
|
@ -10,15 +10,24 @@ set -x
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||
fi
|
||||
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
||||
fi
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@ -90,9 +99,15 @@ kill_gpu_processes() {
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
if command -v nvidia-smi; then
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
elif command -v amd-smi; then
|
||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
fi
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
@ -309,11 +324,14 @@ run_serving_tests() {
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
@ -358,7 +376,7 @@ main() {
|
||||
# get the current IP address, required by benchmark_serving.py
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||
export VLLM_LOG_LEVEL="WARNING"
|
||||
export VLLM_LOGGING_LEVEL="WARNING"
|
||||
|
||||
# prepare for benchmarking
|
||||
cd benchmarks || exit 1
|
||||
|
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
@ -34,10 +35,8 @@ serving_column_mapping = {
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
@ -56,17 +55,16 @@ if __name__ == "__main__":
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(
|
||||
serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping)
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
|
||||
serving_md_table_with_headers = tabulate(serving_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
serving_md_table_with_headers = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
# remove the first line of header
|
||||
serving_md_table_lines = serving_md_table_with_headers.split('\n')
|
||||
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
|
||||
serving_md_table_lines = serving_md_table_with_headers.split("\n")
|
||||
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
|
||||
|
||||
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||
@ -76,10 +74,9 @@ if __name__ == "__main__":
|
||||
# document results with header.
|
||||
# for those who wants to reproduce our benchmark.
|
||||
f.write(serving_md_table_with_headers)
|
||||
f.write('\n')
|
||||
f.write("\n")
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||
|
||||
results = serving_results.to_dict(orient='records')
|
||||
results = serving_results.to_dict(orient="records")
|
||||
f.write(json.dumps(results))
|
||||
|
@ -63,10 +63,12 @@
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"disable_log_requests": "",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
"speculative_draft_tensor_parallel_size": 1
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
"draft_tensor_parallel_size": 1
|
||||
}
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
|
@ -32,4 +32,4 @@
|
||||
"backend": "vllm"
|
||||
}
|
||||
}
|
||||
]
|
||||
]
|
||||
|
46
.buildkite/pyproject.toml
Normal file
46
.buildkite/pyproject.toml
Normal file
@ -0,0 +1,46 @@
|
||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||
# following differences:
|
||||
# - ruff line length is overridden to 88
|
||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
"vllm/version.py" = ["F401"]
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
"E",
|
||||
# Pyflakes
|
||||
"F",
|
||||
# pyupgrade
|
||||
"UP",
|
||||
# flake8-bugbear
|
||||
"B",
|
||||
# flake8-simplify
|
||||
"SIM",
|
||||
# isort
|
||||
"I",
|
||||
# flake8-logging-format
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
# f-string format
|
||||
"UP032",
|
||||
# Can remove once 3.10+ is the minimum Python version
|
||||
"UP007",
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = true
|
@ -1,12 +1,25 @@
|
||||
steps:
|
||||
- label: "Build wheel - CUDA 12.1"
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
id: build-wheel-cuda-12-6
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -17,13 +30,14 @@ steps:
|
||||
|
||||
- label: "Build wheel - CUDA 11.8"
|
||||
# depends_on: block-build-cu118-wheel
|
||||
id: build-wheel-cuda-11-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -33,33 +47,49 @@ steps:
|
||||
|
||||
- label: "Build release image"
|
||||
depends_on: block-release-image-build
|
||||
id: build-release-image
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- build-release-image
|
||||
- build-wheel-cuda-12-8
|
||||
- build-wheel-cuda-12-6
|
||||
- build-wheel-cuda-11-8
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Build and publish TPU release image"
|
||||
depends_on: ~
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: tpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
|
||||
- "yes | docker system prune -a"
|
||||
- "git fetch --all"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
||||
- "docker push vllm/vllm-tpu:nightly"
|
||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllm
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: "release-version"
|
||||
key: release-version
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
@ -71,7 +101,22 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build Neuron release image"
|
||||
key: block-neuron-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish Neuron release image"
|
||||
depends_on: block-neuron-release-image-build
|
||||
agents:
|
||||
queue: neuron-postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
@ -1,160 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels "* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/test_attention.py \
|
||||
--ignore=kernels/test_attention_selector.py \
|
||||
--ignore=kernels/test_blocksparse_attention.py \
|
||||
--ignore=kernels/test_causal_conv1d.py \
|
||||
--ignore=kernels/test_cutlass.py \
|
||||
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/test_flash_attn.py \
|
||||
--ignore=kernels/test_flashinfer.py \
|
||||
--ignore=kernels/test_int8_quant.py \
|
||||
--ignore=kernels/test_machete_gemm.py \
|
||||
--ignore=kernels/test_mamba_ssm.py \
|
||||
--ignore=kernels/test_marlin_gemm.py \
|
||||
--ignore=kernels/test_moe.py \
|
||||
--ignore=kernels/test_prefix_prefill.py \
|
||||
--ignore=kernels/test_rand.py \
|
||||
--ignore=kernels/test_sampler.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_accuracy.py \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
--ignore=entrypoints/openai/test_embedding.py \
|
||||
--ignore=entrypoints/openai/test_oot_registration.py "}
|
||||
fi
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
fi
|
||||
done
|
||||
else
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
|
@ -1,88 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip install -r vllm/requirements-test.txt
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online serving
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
@ -1,16 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t openvino-test -f Dockerfile.openvino .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
@ -1,26 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
@ -1,19 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the XPU docker image and runs offline inference inside the container.
|
||||
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t xpu-test -f Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f xpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference/tensor parallel
|
||||
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
||||
'
|
31
.buildkite/scripts/annotate-release.sh
Executable file
31
.buildkite/scripts/annotate-release.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get release version and strip leading 'v' if present
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
|
||||
|
||||
if [ -z "$RELEASE_VERSION" ]; then
|
||||
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel:
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai:latest
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
17
.buildkite/scripts/ci-clean-log.sh
Normal file
17
.buildkite/scripts/ci-clean-log.sh
Normal file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
# Usage: ./ci-clean-log.sh ci.log
|
||||
# This script strips timestamps and color codes from CI log files.
|
||||
|
||||
# Check if argument is given
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Usage: $0 ci.log"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
INPUT_FILE="$1"
|
||||
|
||||
# Strip timestamps
|
||||
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
|
||||
|
||||
# Strip colorization
|
||||
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
245
.buildkite/scripts/hardware_ci/run-amd-test.sh
Executable file
245
.buildkite/scripts/hardware_ci/run-amd-test.sh
Executable file
@ -0,0 +1,245 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs tests inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
# Wait until the GPU state file reports "clean" before doing anything else
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
|
||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels/core"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/attention"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/attention/stest_attention_selector.py \
|
||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_aqlm.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/moe"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/openai tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
fi
|
||||
done
|
||||
else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
|
49
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
Executable file
49
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
Executable file
@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the CPU docker image and runs offline inference inside the container.
|
||||
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
if [[ -n "$container_id" ]]; then
|
||||
podman stop --all -t0
|
||||
podman rm -f "$container_id" || true
|
||||
fi
|
||||
podman system prune -f
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
||||
|
||||
# Run the image
|
||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
||||
|
||||
function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||
}
|
||||
|
||||
# All CPU tests are expected to finish in less than 40 minutes.
|
||||
|
||||
export container_id
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c cpu_tests
|
||||
|
@ -10,5 +10,4 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||
|
||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
92
.buildkite/scripts/hardware_ci/run-cpu-test.sh
Normal file
92
.buildkite/scripts/hardware_ci/run-cpu-test.sh
Normal file
@ -0,0 +1,92 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the CPU docker image and runs offline inference inside the container.
|
||||
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
VLLM_USE_V1=0 pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online serving
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
}
|
||||
|
||||
# All CPU tests are expected to finish in less than 1 hour (see the timeout below).
|
||||
export -f cpu_tests
|
||||
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
@ -9,11 +9,13 @@ python3 use_existing_torch.py
|
||||
|
||||
# Try building the docker image
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t gh200-test \
|
||||
--build-arg max_jobs=66 \
|
||||
--build-arg nvcc_threads=2 \
|
||||
--build-arg RUN_WHEEL_CHECK=false \
|
||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||
|
||||
@ -23,6 +25,6 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
@ -5,20 +5,22 @@
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
|
||||
|
||||
# Setup cleanup
|
||||
# certain versions of HPU software stack have a bug that can
|
||||
# override the exit code of the script, so we need to use
|
||||
# separate remove_docker_container and remove_docker_container_and_exit
|
||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
|
||||
trap remove_docker_container_and_exit EXIT
|
||||
remove_docker_container
|
||||
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
|
||||
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
|
||||
trap remove_docker_containers_and_exit EXIT
|
||||
remove_docker_containers
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
|
||||
|
||||
EXITCODE=$?
|
@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
||||
|
||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
||||
|
||||
# prune old image and containers to save disk space, and only once a day
|
||||
# by using a timestamp file in tmp.
|
||||
@ -35,7 +36,7 @@ else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
@ -44,11 +45,19 @@ remove_docker_container() {
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Run the image
|
||||
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
|
||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
${image_name} \
|
||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
|
||||
/bin/bash -c "
|
||||
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||
echo \"Running test file: \$f\";
|
||||
python3 -m pytest \$f -v --capture=tee-sys;
|
||||
done
|
||||
"
|
185
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
Executable file
185
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
Executable file
@ -0,0 +1,185 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
31
.buildkite/scripts/hardware_ci/run-xpu-test.sh
Normal file
31
.buildkite/scripts/hardware_ci/run-xpu-test.sh
Normal file
@ -0,0 +1,31 @@
|
||||
#!/bin/bash

# This script builds the XPU docker image and runs offline inference inside
# the container. It serves as a sanity check for compilation and basic model
# usage.
set -ex

# Both names embed the commit under test; the container name also gets a
# random 10-character suffix.
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .

# Setup cleanup: remove the container and image created by this run, then
# prune. Every step is best-effort so cleanup never changes the exit status.
remove_docker_container() {
    docker rm -f "${container_name}" || true;
    docker image rm -f "${image_name}" || true;
    docker system prune -f || true;
}
trap remove_docker_container EXIT

# Run the image and test offline inference/tensor parallel.
# `set -e` inside the payload is required: the outer `set -ex` does not reach
# the shell started in the container, and without it a failure of the first
# command would be hidden by the exit status of the second.
docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
    set -e
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'
|
18
.buildkite/scripts/rerun-test.sh
Normal file
18
.buildkite/scripts/rerun-test.sh
Normal file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash

# Runs one pytest target over and over until an invocation fails. After each
# passing run the number of the upcoming run is printed.
#
# Usage: ./rerun-test.sh path/to/test.py::test_name

# The test target is mandatory; show usage and bail out when it is missing.
if (( $# < 1 )); then
    printf '%s\n' "Usage: $0 path/to/test.py::test_name"
    printf '%s\n' "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

TEST=$1
COUNT=1

# The loop ends on the first failing pytest invocation.
while pytest -sv "$TEST"
do
    COUNT=$((COUNT + 1))
    echo "RUN NUMBER ${COUNT}"
done
|
@ -5,8 +5,8 @@
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
# cd into parent directory of this file
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||
# cd 2 levels into the working directory
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
@ -3,7 +3,7 @@
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
fi
|
||||
|
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash

# Prunes Docker images and volumes when the filesystem that holds Docker's
# root directory is more than $threshold percent full.
set -euo pipefail

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
fi
echo "Docker root directory: $docker_root"

# Check disk usage of the filesystem where Docker's root directory is located.
# -P selects the POSIX output format (one record per line, capacity in the
# fifth column), so positional parsing does not depend on df's default layout.
disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

# A non-numeric value would make the comparison below error out and silently
# take the "no cleanup" branch, so reject it explicitly.
if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo "Failed to determine disk usage for $docker_root (got '$disk_usage')."
    exit 1
fi

# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
else
    echo "Disk usage is below $threshold%. No cleanup needed."
fi
|
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
@ -0,0 +1,14 @@
|
||||
# Environment config
|
||||
TEST_NAME=llama8b
|
||||
CONTAINER_NAME=vllm-tpu
|
||||
|
||||
# vllm config
|
||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||
MAX_NUM_SEQS=512
|
||||
MAX_NUM_BATCHED_TOKENS=512
|
||||
TENSOR_PARALLEL_SIZE=1
|
||||
MAX_MODEL_LEN=2048
|
||||
DOWNLOAD_DIR=/mnt/disks/persist
|
||||
EXPECTED_THROUGHPUT=8.0
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=128
|
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
@ -0,0 +1,102 @@
|
||||
#!/bin/bash

# Builds the TPU benchmark image, starts a container configured from an env
# file, runs the benchmark script inside it, copies the logs back, and fails
# when the measured request throughput is below EXPECTED_THROUGHPUT.
#
# Usage: docker_run_bm.sh <env-file>
#
# NOTE: this script does not enable `set -e`; steps that must not be skipped
# over on failure are checked explicitly.

if [ ! -f "$1" ]; then
    echo "Error: The env file '$1' does not exist."
    exit 1  # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source "$ENV_FILE"

# Best-effort removal of the containers this script may have started.
remove_docker_container() {
    docker rm -f tpu-test || true;
    docker rm -f vllm-tpu || true;
    docker rm -f "$CONTAINER_NAME" || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
# Stop on a failed build; otherwise `docker run` below would use whatever
# image already carries the vllm/vllm-tpu-bm tag.
if ! DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=16 \
    --build-arg USE_SCCACHE=1 \
    --build-arg GIT_REPO_CHECK=0 \
    --tag vllm/vllm-tpu-bm \
    --progress plain -f docker/Dockerfile.tpu .; then
    echo "Error: docker build failed."
    exit 1
fi

# Check mktemp explicitly, since a failure would not abort the script.
LOG_ROOT=$(mktemp -d) || { echo "Error: failed to create a temporary log directory."; exit 1; }
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN is not set or is empty."
    exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
    exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
# The container idles on `tail -f /dev/null`; the benchmark is started with
# `docker exec` below.
docker run \
    -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
    --env-file "$ENV_FILE" \
    -e HF_TOKEN="$HF_TOKEN" \
    -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
    -e MODEL="$MODEL" \
    -e WORKSPACE=/workspace \
    --name "$CONTAINER_NAME" \
    -d \
    --privileged \
    --network host \
    -v /dev/shm:/dev/shm \
    vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
# run_bm.sh is located in .buildkite/scripts/tpu/, next to this script.
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

# Keep only digits and dots from the throughput line of the benchmark log.
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
    echo "Running inside Buildkite"
    buildkite-agent artifact upload "$VLLM_LOG"
    buildkite-agent artifact upload "$BM_LOG"
else
    echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "Failed to get the throughput"
    exit 1
fi

# bc prints 1 when the comparison holds, which (( )) treats as true.
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
    echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
    exit 1
fi
|
94
.buildkite/scripts/tpu/run_bm.sh
Executable file
94
.buildkite/scripts/tpu/run_bm.sh
Executable file
@ -0,0 +1,94 @@
|
||||
#!/bin/bash

# Starts `vllm serve` in the background, waits for it to report startup, runs
# benchmarks/benchmark_serving.py against it with the sonnet dataset, and
# prints the measured request throughput. Configuration (MODEL, WORKSPACE,
# MAX_NUM_SEQS, ...) is read from environment variables.
set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

# When a target commit is given, refuse to benchmark a different checkout.
# An unset TARGET_COMMIT is treated like an empty one (check skipped).
if [ -n "${TARGET_COMMIT:-}" ]; then
    head_hash=$(git rev-parse HEAD)
    if [ "$TARGET_COMMIT" != "$head_hash" ]; then
        echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
        exit 1
    fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
# -p: do not abort under `set -e` when the folder already exists.
mkdir -p "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve "$MODEL" \
    --seed 42 \
    --disable-log-requests \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
    --no-enable-prefix-caching \
    --download_dir "$DOWNLOAD_DIR" \
    --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &


echo "wait for 20 minutes.."
echo
# Poll the server log every 10 seconds, at most 120 times (20 minutes).
server_ready=0
for _ in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
        exit 1
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
        echo "Application started"
        server_ready=1
        break
    else
        echo "wait for 10 seconds..."
        sleep 10
    fi
done

# Do not benchmark a server that never reported startup.
if [ "$server_ready" -ne 1 ]; then
    echo "Error: vllm did not report startup within 20 minutes, see $VLLM_LOG"
    exit 1
fi

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model "$MODEL" \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
    --sonnet-input-len "$INPUT_LEN" \
    --sonnet-output-len "$OUTPUT_LEN" \
    --ignore-eos > "$BM_LOG"

echo "completed..."
echo

# Keep only digits and dots from the throughput line of the benchmark log.
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo
|
@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
else
|
||||
# only upload index.html for cu12 wheels (default wheels)
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
fi
|
||||
@ -63,9 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
else
|
||||
# only upload index.html for cu12 wheels (default wheels)
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
@ -8,6 +8,7 @@
|
||||
# Documentation
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. it must be unblocked manually) unless it is a scheduled nightly run.
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
@ -31,17 +32,17 @@ steps:
|
||||
##### fast check tests #####
|
||||
|
||||
- label: Documentation Build # 2min
|
||||
working_dir: "/vllm-workspace/test_docs/docs"
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/test_docs"
|
||||
fast_check: true
|
||||
no_gpu: True
|
||||
commands:
|
||||
- pip install -r requirements-docs.txt
|
||||
- SPHINXOPTS=\"-W\" make html
|
||||
# Check API reference (if it fails, you may have missing mock imports)
|
||||
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||
- pip install -r ../requirements/docs.txt
|
||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
||||
- mkdocs build
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
fast_check: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
@ -57,11 +58,13 @@ steps:
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Python-only Installation Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
@ -69,8 +72,9 @@ steps:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Basic Correctness Test # 30min
|
||||
#mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
@ -78,12 +82,14 @@ steps:
|
||||
- tests/basic_correctness/test_preemption
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
@ -92,7 +98,7 @@ steps:
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
|
||||
- label: Core Test # 10min
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/core
|
||||
@ -102,9 +108,10 @@ steps:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
mirror_hardwares: [amd]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
@ -112,59 +119,71 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile/test_basic_correctness
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
commands:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- python3 ../examples/offline_inference/rlhf.py
|
||||
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 2
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/metrics
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
@ -174,7 +193,7 @@ steps:
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
@ -182,34 +201,41 @@ steps:
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test
|
||||
#mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/core
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/engine
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/sample
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/worker
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s v1/kv_connector/unit
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- VLLM_USE_V1=1 pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/e2e
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: Examples Test # 25min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- examples/
|
||||
@ -220,17 +246,20 @@ steps:
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/vision_language.py
|
||||
- python3 offline_inference/vision_language_multi_image.py
|
||||
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
|
||||
- label: Prefix Caching Test # 9min
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/prefix_caching
|
||||
@ -238,6 +267,7 @@ steps:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
@ -247,18 +277,8 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test # 5min
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/model_executor/guided_decoding
|
||||
- tests/test_logits_processor
|
||||
- tests/model_executor/test_guided_processors
|
||||
commands:
|
||||
- pytest -v -s test_logits_processor.py
|
||||
- pytest -v -s model_executor/test_guided_processors.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/spec_decode
|
||||
- tests/spec_decode
|
||||
@ -269,15 +289,29 @@ steps:
|
||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
fast_check: true
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@ -286,60 +320,127 @@ steps:
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||
|
||||
- label: "PyTorch Fullgraph Test" # 18min
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Test %N # 1h each
|
||||
mirror_hardwares: [amd]
|
||||
- label: Kernels Core Operation Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/attention
|
||||
- tests/kernels
|
||||
- tests/kernels/core
|
||||
commands:
|
||||
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
- pytest -v -s kernels/core
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
commands:
|
||||
- pytest -v -s kernels/moe
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash run-benchmarks.sh
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Quantization Test # 33min
|
||||
- label: Benchmarks CLI Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Quantization Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release
|
||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
@ -348,6 +449,7 @@ steps:
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
@ -355,97 +457,117 @@ steps:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
mirror_hardwares: [ amd ]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/mistral_tool_use
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
- pytest -v -s mistral_tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 24min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_utils.py
|
||||
- pytest -v -s models/test_vision.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Language Models Test (Standard) # 32min
|
||||
#mirror_hardwares: [amd]
|
||||
- label: Language Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/language -m core_model
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
- label: Language Models Test (Extended) # 1h10min
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language/generation -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/audio_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal/processing
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
||||
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2 # 38m
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Quantized Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
@ -457,6 +579,7 @@ steps:
|
||||
##### multi gpus test #####
|
||||
|
||||
- label: Distributed Comm Ops Test # 7min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -467,6 +590,7 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
@ -485,7 +609,7 @@ steps:
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -499,33 +623,39 @@ steps:
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
||||
# test sequence parallel
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin tests, all the code in-between runs on dummy platform
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
@ -536,8 +666,10 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -558,6 +690,7 @@ steps:
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -571,6 +704,7 @@ steps:
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: LoRA TP Test (Distributed)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
@ -579,16 +713,14 @@ steps:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||
- pytest -v -s -x lora/test_long_context.py
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -598,6 +730,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
@ -636,4 +769,4 @@ steps:
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
36
.github/CODEOWNERS
vendored
36
.github/CODEOWNERS
vendored
@ -10,27 +10,41 @@
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||
/vllm/model_executor/guided_decoding @mgoin
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm
|
||||
/vllm/entrypoints @aarnphm
|
||||
CMakeLists.txt @tlrmchlsmth
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm
|
||||
|
||||
# Test ownership
|
||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multi_step @alexm-redhat @comaniac
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
|
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
24
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
24
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
@ -8,25 +8,35 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
|
||||
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
|
||||
- Passwords or authentication credentials
|
||||
- Private URLs or endpoints
|
||||
- Personal or confidential data
|
||||
|
||||
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
<details>
|
||||
<summary>The output of `python collect_env.py`</summary>
|
||||
<summary>The output of <code>python collect_env.py</code></summary>
|
||||
|
||||
```text
|
||||
Your output of `python collect_env.py` here
|
||||
```
|
||||
|
||||
|
||||
</details>
|
||||
validations:
|
||||
required: true
|
||||
@ -75,20 +85,20 @@ body:
|
||||
```
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback.
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||
value: |
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
||||
|
||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||
|
||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||
|
||||
Thanks for contributing 🎉!
|
||||
Thanks for reporting 🙏!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
|
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
Normal file
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
Normal file
@ -0,0 +1,69 @@
|
||||
name: 🧪 CI failure report
|
||||
description: Report a failing test.
|
||||
title: "[CI Failure]: "
|
||||
labels: ["ci-failure"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Include the name of the failing Buildkite step and test file in the title.
|
||||
- type: input
|
||||
attributes:
|
||||
label: Name of failing test
|
||||
description: |
|
||||
Paste in the fully-qualified name of the failing test from the logs.
|
||||
placeholder: |
|
||||
`path/to/test_file.py::test_name[params]`
|
||||
validations:
|
||||
required: true
|
||||
- type: checkboxes
|
||||
attributes:
|
||||
label: Basic information
|
||||
description: Select all items that apply to the failing test.
|
||||
options:
|
||||
- label: Flaky test
|
||||
- label: Can reproduce locally
|
||||
- label: Caused by external libraries (e.g. bug in `transformers`)
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🧪 Describe the failing test
|
||||
description: |
|
||||
Please provide a clear and concise description of the failing test.
|
||||
placeholder: |
|
||||
A clear and concise description of the failing test.
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 📝 History of failing test
|
||||
description: |
|
||||
Since when did the test start to fail?
|
||||
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
|
||||
|
||||
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
|
||||
|
||||
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
|
||||
|
||||
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
|
||||
|
||||
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
|
||||
placeholder: |
|
||||
Approximate timeline and/or problematic PRs
|
||||
|
||||
A link to the Buildkite analytics of the failing test (if available)
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: CC List.
|
||||
description: >
|
||||
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for reporting 🙏!
|
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
@ -9,7 +9,7 @@ body:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
|
@ -35,7 +35,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
@ -1,28 +0,0 @@
|
||||
name: 🎲 Misc/random discussions that do not fit into the above categories.
|
||||
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
|
||||
title: "[Misc]: "
|
||||
labels: ["misc"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Anything you want to discuss about vllm.
|
||||
description: >
|
||||
Anything you want to discuss about vllm.
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
4
.github/ISSUE_TEMPLATE/config.yml
vendored
4
.github/ISSUE_TEMPLATE/config.yml
vendored
@ -1 +1,5 @@
|
||||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Questions
|
||||
url: https://discuss.vllm.ai
|
||||
about: Ask questions and discuss with other vLLM community members
|
||||
|
18
.github/PULL_REQUEST_TEMPLATE.md
vendored
18
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,6 +1,18 @@
|
||||
FILL IN THE PR DESCRIPTION HERE
|
||||
## Essential Elements of an Effective PR Description Checklist
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
|
||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||
|
||||
## Purpose
|
||||
|
||||
## Test Plan
|
||||
|
||||
## Test Result
|
||||
|
||||
## (Optional) Documentation Update
|
||||
|
||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
2
.github/dependabot.yml
vendored
2
.github/dependabot.yml
vendored
@ -23,7 +23,7 @@ updates:
|
||||
- dependency-name: "lm-format-enforcer"
|
||||
- dependency-name: "gguf"
|
||||
- dependency-name: "compressed-tensors"
|
||||
- dependency-name: "ray[adag]"
|
||||
- dependency-name: "ray[cgraph]" # Ray Compiled Graph
|
||||
- dependency-name: "lm-eval"
|
||||
groups:
|
||||
minor-update:
|
||||
|
105
.github/mergify.yml
vendored
105
.github/mergify.yml
vendored
@ -5,6 +5,7 @@ pull_request_rules:
|
||||
- or:
|
||||
- files~=^[^/]+\.md$
|
||||
- files~=^docs/
|
||||
- files~=^examples/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
@ -18,7 +19,7 @@ pull_request_rules:
|
||||
- files~=\.buildkite/
|
||||
- files~=^cmake/
|
||||
- files=CMakeLists.txt
|
||||
- files~=^Dockerfile
|
||||
- files~=^docker/Dockerfile
|
||||
- files~=^requirements.*\.txt
|
||||
- files=setup.py
|
||||
actions:
|
||||
@ -35,15 +36,52 @@ pull_request_rules:
|
||||
add:
|
||||
- frontend
|
||||
|
||||
- name: label-llama
|
||||
description: Automatically apply llama label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/.*llama.*\.py
|
||||
- files~=^tests/.*llama.*\.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
|
||||
- files~=^vllm/model_executor/models/.*llama.*\.py
|
||||
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- llama
|
||||
|
||||
- name: label-multi-modality
|
||||
description: Automatically apply multi-modality label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^vllm/multimodal/
|
||||
- files~=^tests/multimodal/
|
||||
- files~=^tests/models/multimodal/
|
||||
- files~=^tests/models/*/audio_language/
|
||||
- files~=^tests/models/*/vision_language/
|
||||
- files=tests/models/test_vision.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- multi-modality
|
||||
|
||||
- name: label-structured-output
|
||||
description: Automatically apply structured-output label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^benchmarks/structured_schemas/
|
||||
- files=benchmarks/benchmark_serving_structured_output.py
|
||||
- files=benchmarks/run_structured_output_benchmark.sh
|
||||
- files=docs/features/structured_outputs.md
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files~=^vllm/model_executor/guided_decoding/
|
||||
- files=tests/model_executor/test_guided_processors.py
|
||||
- files=tests/entrypoints/llm/test_guided_generate.py
|
||||
- files=benchmarks/benchmark_serving_guided.py
|
||||
- files=benchmarks/benchmark_guided.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
@ -72,6 +110,56 @@ pull_request_rules:
|
||||
add:
|
||||
- v1
|
||||
|
||||
- name: label-tpu
|
||||
description: Automatically apply tpu label
|
||||
# Keep this list in sync with `label-tpu-remove` conditions
|
||||
conditions:
|
||||
- or:
|
||||
- files~=tpu.py
|
||||
- files~=_tpu
|
||||
- files~=tpu_
|
||||
- files~=/tpu/
|
||||
- files~=pallas
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- tpu
|
||||
|
||||
- name: label-tpu-remove
|
||||
description: Automatically remove tpu label
|
||||
# Keep this list in sync with `label-tpu` conditions
|
||||
conditions:
|
||||
- and:
|
||||
- -files~=tpu.py
|
||||
- -files~=_tpu
|
||||
- -files~=tpu_
|
||||
- -files~=/tpu/
|
||||
- -files~=pallas
|
||||
actions:
|
||||
label:
|
||||
remove:
|
||||
- tpu
|
||||
|
||||
- name: label-tool-calling
|
||||
description: Automatically add tool-calling label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^tests/tool_use/
|
||||
- files~=^tests/mistral_tool_use/
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
- files=docs/features/tool_calling.md
|
||||
- files~=^examples/tool_chat_*
|
||||
- files=examples/offline_inference/chat_with_tools.py
|
||||
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
|
||||
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
|
||||
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- tool-calling
|
||||
|
||||
- name: ping author on conflicts and add 'needs-rebase' label
|
||||
conditions:
|
||||
- conflict
|
||||
@ -87,6 +175,17 @@ pull_request_rules:
|
||||
|
||||
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||
|
||||
- name: assign reviewer for tensorizer changes
|
||||
conditions:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
users:
|
||||
- "sangstar"
|
||||
|
||||
- name: remove 'needs-rebase' label when conflict is resolved
|
||||
conditions:
|
||||
- -conflict
|
||||
|
2
.github/scripts/cleanup_pr_body.sh
vendored
2
.github/scripts/cleanup_pr_body.sh
vendored
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||
python3 - <<EOF
|
||||
import re
|
||||
import regex as re
|
||||
|
||||
with open("${NEW}", "r") as file:
|
||||
content = file.read()
|
||||
|
2
.github/workflows/add_label_automerge.yml
vendored
2
.github/workflows/add_label_automerge.yml
vendored
@ -1,4 +1,6 @@
|
||||
name: Add label on auto-merge enabled
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types:
|
||||
|
7
.github/workflows/cleanup_pr_body.yml
vendored
7
.github/workflows/cleanup_pr_body.yml
vendored
@ -20,7 +20,12 @@ jobs:
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install regex
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
|
11
.github/workflows/lint-and-deploy.yaml
vendored
11
.github/workflows/lint-and-deploy.yaml
vendored
@ -2,6 +2,9 @@ name: Lint and Deploy Charts
|
||||
|
||||
on: pull_request
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
lint-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
@ -12,7 +15,7 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Helm
|
||||
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
|
||||
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
|
||||
with:
|
||||
version: v3.14.4
|
||||
|
||||
@ -50,7 +53,7 @@ jobs:
|
||||
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
|
||||
|
||||
- name: Build the Docker image vllm cpu
|
||||
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
|
||||
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
|
||||
|
||||
- name: Configuration of docker images, network and namespace for the kind cluster
|
||||
run: |
|
||||
@ -66,7 +69,7 @@ jobs:
|
||||
export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
||||
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
||||
|
||||
|
||||
- name: curl test
|
||||
run: |
|
||||
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
|
||||
@ -79,4 +82,4 @@ jobs:
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'):$CODE"
|
||||
echo "$CODE"
|
||||
echo "$CODE"
|
||||
|
3
.github/workflows/pre-commit.yml
vendored
3
.github/workflows/pre-commit.yml
vendored
@ -5,6 +5,9 @@ on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
|
4
.github/workflows/publish.yml
vendored
4
.github/workflows/publish.yml
vendored
@ -39,7 +39,7 @@ jobs:
|
||||
const script = require('.github/workflows/scripts/create_release.js')
|
||||
await script(github, context, core)
|
||||
|
||||
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
|
||||
# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
|
||||
# wheel:
|
||||
# name: Build Wheel
|
||||
# runs-on: ${{ matrix.os }}
|
||||
@ -50,7 +50,7 @@ jobs:
|
||||
# matrix:
|
||||
# os: ['ubuntu-20.04']
|
||||
# python-version: ['3.9', '3.10', '3.11', '3.12']
|
||||
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
|
||||
# cuda-version: ['11.8', '12.1']
|
||||
|
||||
# steps:
|
||||
|
2
.github/workflows/reminder_comment.yml
vendored
2
.github/workflows/reminder_comment.yml
vendored
@ -1,4 +1,6 @@
|
||||
name: PR Reminder Comment Bot
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
2
.github/workflows/scripts/build.sh
vendored
2
.github/workflows/scripts/build.sh
vendored
@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
|
||||
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
|
||||
|
||||
# Install requirements
|
||||
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
|
||||
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
|
||||
|
||||
# Limit the number of parallel jobs to avoid OOM
|
||||
export MAX_JOBS=1
|
||||
|
2
.github/workflows/scripts/create_release.js
vendored
2
.github/workflows/scripts/create_release.js
vendored
@ -1,4 +1,4 @@
|
||||
// Uses Github's API to create the release and wait for result.
|
||||
// Uses GitHub's API to create the release and wait for result.
|
||||
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
|
||||
|
||||
module.exports = async (github, context, core) => {
|
||||
|
12
.gitignore
vendored
12
.gitignore
vendored
@ -2,7 +2,7 @@
|
||||
/vllm/_version.py
|
||||
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/
|
||||
vllm/vllm_flash_attn/*
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@ -77,10 +77,6 @@ instance/
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
docs/source/getting_started/examples/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
@ -150,6 +146,7 @@ venv.bak/
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
docs/examples
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
@ -197,8 +194,11 @@ _build/
|
||||
hip_compat.h
|
||||
|
||||
# Benchmark dataset
|
||||
benchmarks/*.json
|
||||
benchmarks/**/*.json
|
||||
|
||||
# Linting
|
||||
actionlint
|
||||
shellcheck*/
|
||||
|
||||
# Ignore moe/marlin_moe gen code
|
||||
csrc/moe/marlin_moe_wna16/kernel_*
|
||||
|
@ -1,49 +1,58 @@
|
||||
default_install_hook_types:
|
||||
- pre-commit
|
||||
- commit-msg
|
||||
default_stages:
|
||||
- pre-commit # Run locally
|
||||
- manual # Run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
repos:
|
||||
- repo: https://github.com/google/yapf
|
||||
rev: v0.43.0
|
||||
hooks:
|
||||
- id: yapf
|
||||
args: [--in-place, --verbose]
|
||||
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
|
||||
exclude: 'vllm/third_party/.*'
|
||||
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
|
||||
exclude: '(.buildkite|benchmarks|build|examples)/.*'
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.3
|
||||
rev: v0.11.7
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--output-format, github, --fix]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: ruff-format
|
||||
files: ^(.buildkite|benchmarks|examples)/.*
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.0
|
||||
rev: v2.4.1
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'pyproject.toml']
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.13.2
|
||||
rev: 6.0.1
|
||||
hooks:
|
||||
- id: isort
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v19.1.7
|
||||
rev: v20.1.3
|
||||
hooks:
|
||||
- id: clang-format
|
||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||
types_or: [c++, cuda]
|
||||
args: [--style=file, --verbose]
|
||||
- repo: https://github.com/jackdewinter/pymarkdown
|
||||
rev: v0.9.27
|
||||
rev: v0.9.29
|
||||
hooks:
|
||||
- id: pymarkdown
|
||||
exclude: '.*\.inc\.md'
|
||||
args: [fix]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
hooks:
|
||||
- id: actionlint
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
rev: 0.6.17
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: mypy-local
|
||||
@ -51,9 +60,8 @@ repos:
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
entry: tools/mypy.sh 1 "3.9"
|
||||
@ -61,7 +69,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.10
|
||||
entry: tools/mypy.sh 1 "3.10"
|
||||
@ -69,7 +76,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.11
|
||||
entry: tools/mypy.sh 1 "3.11"
|
||||
@ -77,7 +83,6 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.12
|
||||
entry: tools/mypy.sh 1 "3.12"
|
||||
@ -85,38 +90,33 @@ repos:
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: shellcheck
|
||||
name: Lint shell scripts
|
||||
entry: tools/shellcheck.sh
|
||||
language: script
|
||||
types: [shell]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: png-lint
|
||||
name: Lint PNG exports from excalidraw
|
||||
entry: tools/png-lint.sh
|
||||
language: script
|
||||
types: [png]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: signoff-commit
|
||||
name: Sign-off Commit
|
||||
entry: bash
|
||||
args:
|
||||
- -c
|
||||
- |
|
||||
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
|
||||
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
|
||||
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
|
||||
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
|
||||
fi
|
||||
language: system
|
||||
verbose: true
|
||||
stages: [commit-msg]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: check-spdx-header
|
||||
name: Check SPDX headers
|
||||
entry: python tools/check_spdx_header.py
|
||||
language: python
|
||||
types: [python]
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: check-filenames
|
||||
name: Check for spaces in all filenames
|
||||
entry: bash
|
||||
@ -126,7 +126,25 @@ repos:
|
||||
language: system
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
exclude: 'vllm/third_party/.*'
|
||||
- id: update-dockerfile-graph
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/update-dockerfile-graph.sh
|
||||
language: script
|
||||
- id: enforce-import-regex-instead-of-re
|
||||
name: Enforce import regex as re
|
||||
entry: python tools/enforce_regex_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# Forbid importing triton directly
|
||||
- id: forbid-direct-triton-import
|
||||
name: "Forbid direct 'import triton'"
|
||||
entry: python tools/check_triton_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
@ -134,5 +152,4 @@ repos:
|
||||
language: system
|
||||
verbose: true
|
||||
pass_filenames: false
|
||||
exclude: 'vllm/third_party/.*'
|
||||
# Insert new entries above the `suggestion` entry
|
||||
|
@ -8,14 +8,10 @@ build:
|
||||
tools:
|
||||
python: "3.12"
|
||||
|
||||
sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
fail_on_warning: true
|
||||
|
||||
# If using Sphinx, optionally build your docs in additional formats such as PDF
|
||||
formats: []
|
||||
mkdocs:
|
||||
configuration: mkdocs.yaml
|
||||
|
||||
# Optionally declare the Python requirements required to build your docs
|
||||
python:
|
||||
install:
|
||||
- requirements: docs/requirements-docs.txt
|
||||
- requirements: requirements/docs.txt
|
||||
|
413
CMakeLists.txt
Executable file → Normal file
413
CMakeLists.txt
Executable file → Normal file
@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
|
||||
|
||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||
|
||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||
|
||||
@ -24,17 +23,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
#
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||
|
||||
#
|
||||
# Supported/expected torch versions for CUDA/ROCm.
|
||||
@ -44,10 +43,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
|
||||
#
|
||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -80,6 +79,15 @@ endif()
|
||||
#
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
else()
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||
#
|
||||
@ -174,6 +182,21 @@ include(FetchContent)
|
||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
|
||||
#
|
||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Define other extension targets
|
||||
#
|
||||
@ -208,28 +231,35 @@ endif()
|
||||
#
|
||||
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
"csrc/attention/merge_attn_states.cu"
|
||||
"csrc/attention/vertical_slash_index.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/sampler.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/quantization/activation_kernels.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||
|
||||
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||
# Please keep this in sync with FetchContent_Declare line below.
|
||||
set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")
|
||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
@ -247,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||
GIT_TAG v3.7.0
|
||||
GIT_TAG ${CUTLASS_REVISION}
|
||||
GIT_PROGRESS TRUE
|
||||
|
||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||
@ -259,16 +289,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp")
|
||||
"csrc/cutlass_extensions/common.cpp"
|
||||
"csrc/attention/mla/cutlass_mla_entry.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_EXT_SRC}"
|
||||
@ -277,10 +307,55 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||
# are not supported by Machete yet.
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if (MARLIN_ARCHS)
|
||||
|
||||
#
|
||||
# For the Marlin kernels we automatically generate sources for various
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
|
||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||
RESULT_VARIABLE marlin_generation_result
|
||||
OUTPUT_VARIABLE marlin_generation_result
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||
)
|
||||
|
||||
if (NOT marlin_generation_result EQUAL 0)
|
||||
message(FATAL_ERROR "Marlin generation failed."
|
||||
" Result: \"${marlin_generation_result}\""
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||
else()
|
||||
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||
message(STATUS "Marlin generation completed successfully.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
||||
@ -297,43 +372,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
" in CUDA target architectures")
|
||||
endif()
|
||||
|
||||
# Only build AllSpark kernels if we are building for at least some compatible archs.
|
||||
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
|
||||
if (ALLSPARK_ARCHS)
|
||||
set(ALLSPARK_SRCS
|
||||
"csrc/quantization/gptq_allspark/allspark_repack.cu"
|
||||
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${ALLSPARK_SRCS}"
|
||||
CUDA_ARCHS "${ALLSPARK_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
|
||||
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building AllSpark kernels as no compatible archs found"
|
||||
" in CUDA target architectures")
|
||||
endif()
|
||||
|
||||
|
||||
set(SCALED_MM_3X_ARCHS)
|
||||
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
||||
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
||||
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||
# CUDA 12.0 or later
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
|
||||
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
|
||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
||||
message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
|
||||
else()
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
||||
message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
|
||||
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||
"later if you intend on running FP8 quantized models on "
|
||||
"Hopper.")
|
||||
else()
|
||||
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
|
||||
message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
|
||||
"in CUDA target architectures")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
|
||||
# build any 3x kernels
|
||||
set(SCALED_MM_3X_ARCHS)
|
||||
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
|
||||
# CUDA 12.8 or later
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||
)
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
|
||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
||||
message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
|
||||
else()
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||
message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
|
||||
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
|
||||
"later if you intend on running FP8 quantized models on "
|
||||
"Blackwell.")
|
||||
else()
|
||||
message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
|
||||
"in CUDA target architectures")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#
|
||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||
# kernels for the remaining archs that are not already built for 3x.
|
||||
# (Build 8.9 for FP8)
|
||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
|
||||
# subtract out the archs that are already built for 3x
|
||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||
if (SCALED_MM_2X_ARCHS)
|
||||
@ -358,17 +479,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# 2:4 Sparse Kernels
|
||||
|
||||
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
|
||||
# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
|
||||
# require CUDA 12.2 or later (and only work on Hopper).
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
|
||||
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
||||
else()
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
|
||||
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
|
||||
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
|
||||
"if you intend on running FP8 sparse quantized models on Hopper.")
|
||||
@ -381,9 +503,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# FP4 Archs and flags
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
)
|
||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
@ -396,6 +520,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(FP4_ARCHS)
|
||||
endif()
|
||||
|
||||
# CUTLASS MLA Archs and flags
# MLA_ARCHS receives the result of intersecting "10.0a" with the requested
# CUDA_ARCHS; an empty result disables this block.
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
# VERSION_GREATER_EQUAL so that a compiler reporting exactly "12.8" qualifies.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
  set(SRCS
    "csrc/attention/mla/cutlass_mla_kernels.cu")
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${MLA_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
  # Add MLA-specific include directories only to MLA source files
  set_source_files_properties(${SRCS}
    PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
  message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
  # Distinguish a too-old compiler from a missing target arch so the status
  # line names the actual reason the kernels were skipped.
  if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    message(STATUS "Not building CUTLASS MLA as CUDA Compiler version is "
                   "not >= 12.8.")
  else()
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
  endif()
  # clear MLA_ARCHS
  set(MLA_ARCHS)
endif()
|
||||
|
||||
# CUTLASS MoE kernels
|
||||
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||
# if it's possible to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
|
||||
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
||||
else()
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
|
||||
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
|
||||
"if you intend on running FP8 quantized MoE models on Hopper.")
|
||||
else()
|
||||
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
||||
"in CUDA target architectures")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Machete kernels
|
||||
|
||||
@ -477,6 +647,7 @@ define_gpu_extension_target(
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
@ -495,28 +666,71 @@ set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/moe_align_sum_kernels.cu"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
||||
endif()
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_MOE_EXT_SRC}"
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
set(MARLIN_MOE_SRC
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||
"csrc/moe/marlin_moe_ops.cu")
|
||||
set(VLLM_MOE_WNA16_SRC
|
||||
"csrc/moe/moe_wna16.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_MOE_WNA16_SRC}"
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
|
||||
#
|
||||
# For the Marlin MOE kernels we automatically generate sources for various
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MOE_MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
||||
|
||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
||||
RESULT_VARIABLE moe_marlin_generation_result
|
||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
||||
)
|
||||
|
||||
if (NOT moe_marlin_generation_result EQUAL 0)
|
||||
message(FATAL_ERROR "Marlin MOE generation failed."
|
||||
" Result: \"${moe_marlin_generation_result}\""
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
||||
else()
|
||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
||||
message(STATUS "Marlin MOE generation completed successfully.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SRC}"
|
||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
||||
|
||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||
@ -524,6 +738,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Add the MoE permute/unpermute sources to the moe extension (CUDA only).
if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
      "csrc/moe/moe_permute_unpermute_op.cu")

  # Use the full target arch list unless a dedicated MOE_PERMUTE_ARCHS list
  # has already been provided; with an empty list no gencode flags would be
  # attached to these sources.
  if(NOT MOE_PERMUTE_ARCHS)
    set(MOE_PERMUTE_ARCHS "${CUDA_ARCHS}")
  endif()

  # The flags must target MOE_PERMUTE_SRC, the list defined above (the
  # previous MARLIN_PERMUTE_SRC name is not defined, so it expanded to nothing).
  set_gencode_flags_for_srcs(
    SRCS "${MOE_PERMUTE_SRC}"
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
|
||||
message(STATUS "Enabling moe extension.")
|
||||
define_gpu_extension_target(
|
||||
_moe_C
|
||||
@ -532,6 +757,8 @@ define_gpu_extension_target(
|
||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
@ -541,6 +768,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
set(VLLM_ROCM_EXT_SRC
|
||||
"csrc/rocm/torch_bindings.cpp"
|
||||
"csrc/rocm/skinny_gemms.cu"
|
||||
"csrc/rocm/attention.cu")
|
||||
|
||||
define_gpu_extension_target(
|
||||
@ -554,77 +782,10 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
WITH_SOABI)
|
||||
endif()
|
||||
|
||||
# vllm-flash-attn currently only supported on CUDA
|
||||
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
return()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||
# we need to manually set VLLM_GPU_ARCHES here.
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
foreach(_ARCH ${CUDA_ARCHS})
|
||||
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Build vLLM flash attention from source
|
||||
#
|
||||
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
|
||||
# They should be identical but if they aren't, this is a massive footgun.
|
||||
#
|
||||
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||
# If no component is specified, vllm-flash-attn is still installed.
|
||||
|
||||
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||
# It can be set as an environment variable or passed as a cmake argument.
|
||||
# The environment variable takes precedence.
|
||||
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn SOURCE_DIR
|
||||
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built; when both are built, redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
# Nothing after vllm-flash-attn, see comment about macros above
|
||||
|
@ -1,3 +1,3 @@
|
||||
# Contributing to vLLM
|
||||
|
||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
|
||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
|
||||
|
@ -1,69 +0,0 @@
|
||||
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
|
||||
|
||||
FROM ubuntu:22.04 AS cpu-test-1
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
|
||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
|
||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||
# intel-openmp provides additional performance improvement vs. openmp
|
||||
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly-used objects.
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install intel-openmp==2025.0.1
|
||||
|
||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
RUN pip install intel_extension_for_pytorch==2.5.0
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
|
||||
pip install --upgrade pip && \
|
||||
pip install -r requirements-build.txt
|
||||
|
||||
FROM cpu-test-1 AS build
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
|
||||
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
|
||||
pip install -v -r requirements-cpu.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||
pip install dist/*.whl && \
|
||||
rm -rf dist
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -e tests/vllm_test_utils
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
@ -1,29 +0,0 @@
|
||||
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y \
|
||||
git python3-pip \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install -U pip
|
||||
# install build requirements
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
|
||||
# build vLLM with OpenVINO backend
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
||||
|
||||
COPY examples/ /workspace/examples
|
||||
COPY benchmarks/ /workspace/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
CMD ["/bin/bash"]
|
@ -1,37 +0,0 @@
|
||||
FROM mambaorg/micromamba
|
||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||
USER root
|
||||
|
||||
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
||||
|
||||
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
|
||||
|
||||
# Some packages in requirements-cpu are installed here
|
||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||
# Currently these may not be available for venv or pip directly
|
||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||
-r requirements-cpu.txt \
|
||||
xformers uvloop==0.20.0
|
||||
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
@ -1,69 +0,0 @@
|
||||
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
curl \
|
||||
ffmpeg \
|
||||
git \
|
||||
libsndfile1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
numactl \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
# vim \
|
||||
wget
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
|
||||
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install --no-cache-dir \
|
||||
-r requirements-xpu.txt
|
||||
|
||||
RUN git clone https://github.com/intel/pti-gpu && \
|
||||
cd pti-gpu/sdk && \
|
||||
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
|
||||
make -j && \
|
||||
cmake --install . --config Release --prefix "/usr/local"
|
||||
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||
|
||||
COPY . .
|
||||
# Opt-in repository sanity check: tools/check_repo.sh runs only when the build
# is given a GIT_REPO_CHECK value other than 0. The explicit default of 0 keeps
# a plain `docker build` (no --build-arg) from running the check, because an
# unset ARG expands to an empty string, which also compares as "!= 0".
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
ENV VLLM_TARGET_DEVICE=xpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 setup.py install
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
# Final stage: extends vllm-base and starts the OpenAI-compatible API server.
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'

# Use the key=value form: the legacy space-separated `ENV key value` syntax
# takes the whole rest of the (continued) line as the value of the first key,
# which would fold "TRITON_XPU_PROFILE 1" into VLLM_USAGE_SOURCE and leave
# TRITON_XPU_PROFILE unset.
ENV VLLM_USAGE_SOURCE=production-docker-image \
    TRITON_XPU_PROFILE=1

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user