Mirror of https://github.com/vllm-project/vllm.git
Synced 2025-10-20 23:03:52 +08:00

Compare commits: v0.6.4.pos...disable-sd (2684 commits)
[Commit list omitted: the compare widget captured only 2,684 bare SHA1 hashes; the author, date, and message columns were empty.]
.buildkite/check-wheel-size.py

@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
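For context, the hunk shows only the constant and one helper's signature; the rest of the script enforces the limit elsewhere. A minimal sketch of how such a check plausibly works — `check_wheel_size`, the directory walk, and the exit codes are assumptions, not quoted from the diff:

```python
# Sketch only: check_wheel_size and the os.walk loop are assumed;
# the hunk above shows just VLLM_MAX_SIZE_MB and print_top_10_largest_files.
import os
import sys

VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def check_wheel_size(directory: str) -> int:
    for root, _, files in os.walk(directory):
        for name in files:
            if not name.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, name)
            size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
            if size_mb > VLLM_MAX_SIZE_MB:
                print(f"{wheel_path} is {size_mb:.2f} MB, "
                      f"over the {VLLM_MAX_SIZE_MB} MB limit.")
                return 1  # non-zero exit fails the CI step
    return 0


if __name__ == "__main__":
    # Directory to scan is passed on the command line, e.g. "dist".
    sys.exit(check_wheel_size(sys.argv[1]))
```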
.buildkite/generate_index.py (new file, 26 lines)

@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+<body>
+<h1>Links for vLLM</h1/>
+<a href="../{wheel_html_escaped}">{wheel}</a><br/>
+</body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
(Per-file headers for the following config hunks were not captured; each hunk prepends a usage comment to one yaml under `.buildkite/lm-eval-harness/configs/`.)

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
@@ -1,11 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
   - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.30
+  - name: "exact_match,flexible-extract"
+    value: 0.465
+limit: 1319
+num_fewshot: 5
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null
.buildkite/lm-eval-harness/configs/models-small.txt

@@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
.buildkite/lm-eval-harness/test_lm_eval_correctness.py

@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@@ -12,9 +13,10 @@ from pathlib import Path
 
 import lm_eval
 import numpy
+import pytest
 import yaml
 
-RTOL = 0.05
+RTOL = 0.08
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -45,6 +47,10 @@ def test_lm_eval_correctness():
     eval_config = yaml.safe_load(
         Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
 
+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
     # Launch eval requests.
     results = launch_lm_eval(eval_config)
 
@@ -1,16 +1,14 @@
# vLLM benchmark suite

## Introduction

This directory contains two sets of benchmarks for vLLM:

- Performance benchmark: benchmarks vLLM's performance under various workloads, so that **developers** can see whether their PR improves or degrades vLLM's performance.
- Nightly benchmark: compares vLLM's performance against alternatives (tgi, trt-llm and lmdeploy), so that **the public** knows when to choose vLLM.

See the [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!), with different models.
@@ -19,7 +17,6 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

**For benchmarking developers**: please try to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.

## Nightly benchmark quick overview

**Benchmarking Coverage**: fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
@@ -28,8 +25,6 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

**Benchmarking Duration**: about 3.5 hrs.

## Trigger the benchmark

Performance benchmark will be triggered when:
@@ -39,16 +34,11 @@ Performance benchmark will be triggered when:
Nightly benchmark will be triggered when:
- Every commit for those PRs with both the `perf-benchmarks` label and the `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

#### Latency test
### Latency test

Here is an example of one test inside `latency-tests.json`:

@@ -68,6 +58,7 @@ Here is an example of one test inside `latency-tests.json`:
```

In this example:

- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note that you should use underscores (`_`) instead of dashes (`-`) when specifying the arguments; `run-performance-benchmarks.sh` converts the underscores back to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` (a reconstructed example entry follows below).
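A minimal sketch of what such an entry might look like, reconstructed from the arguments described above; the test name and values are illustrative, not the repository's exact file:

```json
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
```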

@@ -75,16 +66,17 @@ Note that the performance numbers are highly sensitive to the value of the param

WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file.

### Throughput test

#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed to `benchmark_throughput.py`.

The numbers from this test are also stable -- though a slight change in a parameter value might vary the performance numbers by a lot.

#### Serving test
### Serving test

We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```
```json
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
@@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
```

Inside this example:

- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` section includes the command-line arguments for the vLLM server.
- The `client-parameters` section includes the command-line arguments for `benchmark_serving.py` (a completed sketch of such an entry follows below).
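A hedged sketch of a complete entry, assembled from the fields named above; the `qps_list` values and both parameter sets are illustrative assumptions rather than the repository's exact configuration:

```json
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server-parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_requests": ""
        },
        "client-parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    }
]
```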

@@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma

WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

#### Visualizing the results
### Visualizing the results

The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
You can find the results presented as a table on the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (as json files) are in the `Artifacts` tab of the benchmarking job.

## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models, and docker containers used to benchmark the other LLM engines.

#### Workflow
### Workflow

- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for the different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which probes the serving engine of the current container.
- `run-nightly-suite.sh` redirects the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload them to Buildkite.

#### Nightly tests
### Nightly tests

In [nightly-tests.json](tests/nightly-tests.json), we include the command-line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to the performance benchmark; a sketch of one entry is shown below.
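Based on the fields that the nightly run scripts read from each test case (`test_name`, `qps_list`, a `common_parameters` object with `model`, `tp`, `dataset_name`, `dataset_path`, `port`, `num_prompts` and `reuse_server`, plus a per-engine `*_server_parameters` object), an entry might look like the following; the concrete values are illustrative assumptions:

```json
[
    {
        "test_name": "llama8B_tp1_sharegpt",
        "qps_list": [4, 8, "inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "port": 8000,
            "num_prompts": 500,
            "reuse_server": false
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": ""
        }
    }
]
```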

#### Docker containers
### Docker containers

The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.

WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@@ -1,5 +1,6 @@
steps:
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
@@ -9,10 +10,102 @@ steps:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
  - label: "Cleanup H100"
    agents:
      queue: H100
    depends_on: ~
    command: docker system prune -a --volumes --force

  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all  # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - kubernetes:
          podSpec:
@@ -41,20 +134,51 @@ steps:
            - name: devshm
              emptyDir:
                medium: Memory
  # - label: "H100"
  #   agents:
  #     queue: H100
  #   plugins:
  #     - docker#v5.11.0:
  #         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
  #         command:
  #           - bash
  #           - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
  #         mount-buildkite-agent: true
  #         propagate-environment: true
  #         ipc: host
  #         gpus: all
  #         environment:
  #           - VLLM_USAGE_SOURCE
  #           - HF_TOKEN

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all  # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN
@@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results.

Please download the visualization scripts in the post

## Results reproduction

- Find the docker image we use in the `benchmarking pipeline`
- Deploy the docker image, and inside the container:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code

```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

  - In the same folder, run the following code:

```console
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

And the results will be inside `./benchmarks/results`.
@@ -2,6 +2,7 @@
# Nightly benchmark

This benchmark aims to:

- Provide performance clarity: make it clear which engine (vLLM, TensorRT-LLM, lmdeploy or SGLang) leads in performance for a given workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions.

@@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)

Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

## Setup

- Docker images:
@@ -33,7 +33,7 @@ Latest reproduction guide: [github issue link](https://github.com/vllm-project/
- Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
- Evaluation metrics: throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).

# Known issues
## Known issues

- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support the `ignore-eos` flag.
@@ -7,10 +7,8 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}

## Throughput tests

- Input length: randomly sample 200 prompts from the ShareGPT dataset (with a fixed random seed).
@@ -19,10 +17,8 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}

## Serving tests

- Input length: randomly sample 200 prompts from the ShareGPT dataset (with a fixed random seed).
@@ -33,10 +29,8 @@
- We also added a speculative decoding test for llama-3 70B, under QPS 2.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

{serving_tests_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
@@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The json string for all benchmarking tables:

```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
@@ -82,8 +84,13 @@ if __name__ == "__main__":
        # this result is generated via `benchmark_serving.py`

        # attach the benchmarking command to raw_result
        try:
            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
        except OSError as e:
            print(e)
            continue

        raw_result.update(command)

        # update the test name of this result
@@ -97,8 +104,13 @@ if __name__ == "__main__":
        # this result is generated via `benchmark_latency.py`

        # attach the benchmarking command to raw_result
        try:
            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
        except OSError as e:
            print(e)
            continue

        raw_result.update(command)

        # update the test name of this result
@@ -119,8 +131,13 @@ if __name__ == "__main__":
        # this result is generated via `benchmark_throughput.py`

        # attach the benchmarking command to raw_result
        try:
            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
        except OSError as e:
            print(e)
            continue

        raw_result.update(command)

        # update the test name of this result
@@ -157,6 +174,18 @@ if __name__ == "__main__":
                                          throughput_results,
                                          serving_results)

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue

        # Sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)

        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse

from transformers import AutoTokenizer
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
from pathlib import Path
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
@@ -43,7 +43,7 @@ main() {

  # The figures should be genereated by a separate process outside the CI/CD pipeline
  # The figures should be generated by a separate process outside the CI/CD pipeline

  # # generate figures
  # python3 -m pip install tabulate pandas matplotlib
@@ -301,6 +301,104 @@ run_serving_tests() {
  kill_gpu_processes
}

run_genai_perf_tests() {
  # run genai-perf tests

  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
      "

      echo "Client command: $client_command"

      eval "$client_command"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes

}

prepare_dataset() {

@@ -328,12 +426,17 @@ main() {

  pip install -U transformers

  pip install -r requirements/dev.txt
  which genai-perf

  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d

  prepare_dataset

@@ -345,6 +448,10 @@ main() {
  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  mv artifacts/ $RESULTS_FOLDER/

  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
@@ -6,18 +6,28 @@

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail

check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
}

@@ -85,17 +95,19 @@ kill_gpu_processes() {

  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pkill -f pt_main_thread
  # this line doesn't work now
  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
  pkill -f python3
  pkill -f /usr/bin/python3
  pgrep python3 | xargs -r kill -9


  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm
@@ -289,7 +301,7 @@ run_serving_tests() {
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &
    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
@@ -312,17 +324,20 @@ run_serving_tests() {

      new_test_name=$test_name"_qps_"$qps

      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"
      bash -c "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
@@ -348,6 +363,11 @@ main() {
  check_gpus
  check_hf_token

  # Set to v1 to run v1 benchmark
  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
    export VLLM_USE_V1=1
  fi

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
@@ -356,7 +376,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOG_LEVEL="WARNING"
  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import datetime
import json
import os
@@ -1,6 +1,10 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi

TIMEOUT_SECONDS=10
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json (new file, 23 lines)
@@ -0,0 +1,23 @@
[
    {
        "test_name": "llama8B_tp1_genai_perf",
        "qps_list": [4,8,16,32],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "port": 8000,
            "num_prompts": 500,
            "reuse_server": false
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "genai_perf_input_parameters": {
        }
    }
]
@@ -64,10 +64,11 @@
      "disable_log_requests": "",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "speculative_model": "turboderp/Qwama-0.5B-Instruct",
      "speculative_config": {
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "speculative_draft_tensor_parallel_size": 1,
        "use_v2_block_manager": ""
        "draft_tensor_parallel_size": 1
      }
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@@ -1,12 +1,23 @@
steps:
  - label: "Build wheel - CUDA 12.1"
  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/upload-wheels.sh"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@@ -18,11 +29,75 @@ steps:
  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    agents:
      queue: cpu_queue
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/upload-wheels.sh"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build release image"
    depends_on: ~
    key: block-release-image-build

  - label: "Build release image"
    depends_on: block-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllm
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
    fields:
      - text: "What is the release version?"
        key: "release-version"

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
    depends_on: ~

  - label: "Build and publish CPU release image"
    depends_on: block-cpu-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build Neuron release image"
    key: block-neuron-release-image-build
    depends_on: ~

  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
@@ -1,52 +0,0 @@
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test

function cpu_tests() {
  set -e

  # Run basic model test
  docker exec cpu-test bash -c "
    set -e
    pip install pytest pytest-asyncio \
      decord einops librosa peft Pillow sentence-transformers soundfile \
      transformers_stream_generator matplotlib datamodel_code_generator
    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # online inference
  docker exec cpu-test bash -c "
    set -e
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
}

# All of the CPU tests are expected to finish in less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests"
@@ -1,78 +0,0 @@
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

function cpu_tests() {
  set -e

  # offline inference
  docker exec cpu-test-avx2 bash -c "
    set -e
    python3 examples/offline_inference.py"

  # Run basic model test
  docker exec cpu-test bash -c "
    set -e
    pip install pytest pytest-asyncio \
      decord einops librosa peft Pillow sentence-transformers soundfile \
      transformers_stream_generator matplotlib datamodel_code_generator
    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
  docker exec cpu-test bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
  docker exec cpu-test bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_ipex_quant.py"

  # online inference
  docker exec cpu-test bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10
    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
}

# All of the CPU tests are expected to finish in less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests $CORE_RANGE"
@@ -1,16 +0,0 @@
#!/bin/bash

# This script builds the HPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t hpu-test-env -f Dockerfile.hpu .

# Setup cleanup
remove_docker_container() { docker rm -f hpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
@@ -1,53 +0,0 @@
#!/bin/bash

# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
  last_build=$(cat /tmp/neuron-docker-build-timestamp)
  current_time=$(date +%s)
  if [ $((current_time - last_build)) -gt 86400 ]; then
    docker system prune -f
    echo "$current_time" > /tmp/neuron-docker-build-timestamp
  fi
else
  date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
  --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
  timeout=300
  counter=0

  while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
    sleep 1
    counter=$((counter + 1))
    if [ $counter -ge $timeout ]; then
      echo "Timeout after $timeout seconds"
      break
    fi
  done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
  localhost:8000/generate \
  -d '{"prompt": "San Francisco is a"}'
@@ -1,16 +0,0 @@
#!/bin/bash

# This script builds the OpenVINO docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
@@ -1,17 +0,0 @@
#!/bin/bash

set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
@@ -1,16 +0,0 @@
#!/bin/bash

# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .

# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
@@ -77,7 +77,6 @@ echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
@@ -85,7 +84,6 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
  --ignore=kernels/test_gguf.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
@@ -93,19 +91,61 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
  --ignore=kernels/test_sampler.py"
  --ignore=kernels/test_sampler.py \
  --ignore=kernels/test_cascade_flash_attn.py \
  --ignore=kernels/test_mamba_mixer2.py \
  --ignore=kernels/test_aqlm.py \
  --ignore=kernels/test_machete_mm.py \
  --ignore=kernels/test_mha_attn.py \
  --ignore=kernels/test_block_fp8.py \
  --ignore=kernels/test_cutlass_moe.py \
  --ignore=kernels/test_mamba_ssm_ssd.py \
  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_block_int8.py \
  --ignore=kernels/test_fused_quant_layernorm.py \
  --ignore=kernels/test_int8_kernel.py \
  --ignore=kernels/test_triton_moe_ptpc_fp8.py \
  --ignore=kernels/test_permute_cols.py"
fi

#ignore certain Entrypoints tests
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_encoder_decoder.py \
  --ignore=entrypoints/openai/test_embedding.py \
  --ignore=entrypoints/openai/test_oot_registration.py "}
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
fi

#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


PARALLEL_JOB_COUNT=8
# check if the command contains the shard flag; we will run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
@@ -115,13 +155,16 @@ if [[ $commands == *"--shard-id="* ]]; then
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
      --device /dev/kfd --device /dev/dri \
      --network host \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      --name "${container_name}_${GPU}" \
@@ -142,13 +185,16 @@ if [[ $commands == *"--shard-id="* ]]; then
    fi
  done
else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd --device /dev/dri \
    --network host \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES=0 \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    --name "${container_name}" \
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh (new executable file, 45 lines)
@@ -0,0 +1,45 @@
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
    podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
}

# All of the CPU tests are expected to finish in less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x .
.buildkite/scripts/hardware_ci/run-cpu-test.sh (new file, 94 lines)
@ -0,0 +1,94 @@
|
||||
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# allow binding to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

function cpu_tests() {
  set -e
  export NUMA_NODE=$2
  export BUILDKITE_BUILD_NUMBER=$3

  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_ipex_quant.py"

  # Run chunked-prefill and prefix-cache test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
      tests/basic_correctness/test_chunked_prefill.py"

  # online serving
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10
    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"

  # Run multi-lora tests
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/lora/test_qwen2vl.py"
}

# All CPU tests are expected to finish within 40 minutes.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
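One detail worth noting: `--cpuset-cpus`/`--cpuset-mems` pin the container to the same cores and NUMA node that `numactl` used for the build, so the tests do not drift across sockets. A quick way to verify such pinning from inside a container (a sketch; the image is arbitrary and the cgroup path assumes cgroup v2):

    docker run --rm --cpuset-cpus="48-95" --cpuset-mems="1" ubuntu:22.04 bash -c '
      taskset -cp 1                             # effective CPU affinity of PID 1
      cat /sys/fs/cgroup/cpuset.mems.effective  # memory nodes the container may allocate from
    '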
.buildkite/scripts/hardware_ci/run-gh200-test.sh (new file, 30 lines)
@@ -0,0 +1,30 @@
#!/bin/bash

# This script builds the GH200 docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
python3 use_existing_torch.py

# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"

# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
  python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'
.buildkite/scripts/hardware_ci/run-hpu-test.sh (new file, 24 lines)
@@ -0,0 +1,24 @@
#!/bin/bash

# This script builds the HPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu .

# Setup cleanup
# Certain versions of the HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$?
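Stripped of the Docker specifics, the exit-code plumbing above reduces to the pattern below: save the workload's status into a variable, and have the trap handler re-raise it so its own commands cannot mask it. A standalone sketch with a hypothetical workload command:

    EXITCODE=1                      # pessimistic default in case the workload never runs
    cleanup() { echo "removing container"; }
    cleanup_and_exit() { cleanup; exit "$EXITCODE"; }
    trap cleanup_and_exit EXIT
    ./run_workload.sh               # hypothetical workload
    EXITCODE=$?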
.buildkite/scripts/hardware_ci/run-neuron-test.sh (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/bin/bash

# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e
set -v

image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# Prune old images and containers to save disk space, at most once a day,
# by using a timestamp file in /tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune -f
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
else
    date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f docker/Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
    docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT

# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
    --name "${container_name}" \
    ${image_name} \
    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
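The timestamp check in this script is a general "at most once a day" guard. The same idea as a reusable helper (a sketch; the function name and stamp path are made up):

    run_daily() {
        local stamp=$1; shift
        local now; now=$(date +%s)
        # run the command only when the stamp file is missing or older than 24 hours
        if [ ! -f "$stamp" ] || [ $(( now - $(cat "$stamp") )) -gt 86400 ]; then
            "$@" && echo "$now" > "$stamp"
        fi
    }
    run_daily /tmp/prune-timestamp docker system prune -f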
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh (new executable file, 54 lines)
@@ -0,0 +1,54 @@
#!/bin/bash

set -xue

# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not have been cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest pytest-asyncio tpu-info \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_XLA_CACHE_PATH= \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
    && tpu-info \
    && echo TEST_0 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
    && echo TEST_1 \
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
    && echo TEST_3 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
    && echo TEST_4 \
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && echo TEST_5 \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
    && echo TEST_6 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
    && echo TEST_7 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
    && echo TEST_8 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
    && echo TEST_9 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
    && echo TEST_10 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
    && echo TEST_11 \
    && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"

# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
.buildkite/scripts/hardware_ci/run-xpu-test.sh (new file, 31 lines)
@@ -0,0 +1,31 @@
#!/bin/bash

# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
  docker rm -f "${container_name}" || true;
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
}
trap remove_docker_container EXIT

# Run the image and test offline inference/tensor parallel
docker run \
  --device /dev/dri \
  -v /dev/dri/by-path:/dev/dri/by-path \
  --entrypoint="" \
  --name "${container_name}" \
  "${image_name}" \
  sh -c '
  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'
@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail

-# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# cd 2 levels into the working directory
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -3,7 +3,7 @@
 set -euox pipefail

 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
     exit 1
 fi
.buildkite/scripts/upload-wheels.sh (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env bash

set -ex

# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
fi

# Get the single wheel file
wheel="${wheel_files[0]}"

# Rename 'linux' to 'manylinux1' in the wheel filename
new_wheel="${wheel/linux/manylinux1}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"

normal_wheel="$wheel" # Save the original wheel filename

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
  suffix="${version##*.}"
  if [[ $suffix == cu* ]]; then
    new_version="1.0.0.dev+${suffix}"
  else
    new_version="1.0.0.dev"
  fi
  new_wheel="${wheel/$version/$new_version}"
  # use cp to keep both files in the artifacts directory
  cp -- "$wheel" "$new_wheel"
  wheel="$new_wheel"
  version="$new_version"
fi

# Generate the index for this commit
python3 .buildkite/generate_index.py --wheel "$normal_wheel"

# Upload the wheels to S3
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
  # if $normal_wheel matches cu121, do not upload the index.html
  echo "Skipping index files for cu121 wheels"
else
  # only upload index.html for cu124 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
  aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi

# Upload the wheels to the nightly index
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
  # if $normal_wheel matches cu121, do not upload the index.html
  echo "Skipping index files for cu121 wheels"
else
  # only upload index.html for cu124 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
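The renames above lean on two bash parameter expansions: `${var/pat/repl}` replaces the first match of `pat`, and `${var##*.}` strips the longest prefix up to the last dot. A quick illustration with made-up file and version names:

    wheel="vllm-1.0.0.dev123-cp312-linux_x86_64.whl"
    echo "${wheel/linux/manylinux1}"   # vllm-1.0.0.dev123-cp312-manylinux1_x86_64.whl
    version="1.0.0.dev123+cu118"
    echo "${version##*.}"              # dev123+cu118 (everything after the last '.')

Note that the `cu*` branch only fires when the token after the last dot itself starts with `cu`; with the hypothetical version above it would fall through to the plain `1.0.0.dev` rename.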
@@ -8,9 +8,9 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
 # fast_check_only(bool): run this test on the fastcheck pipeline only
-# nightly(bool): run this test in the nightly pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for tests. incompatible with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -36,13 +36,12 @@ steps:
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r requirements-docs.txt
+  - pip install -r ../../requirements/docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/dev/sampling_params.html
+  - grep \"sig sig-object py\" build/html/api/inference_params.html

 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/mq_llm_engine
@@ -51,7 +50,9 @@ steps:
   - tests/multimodal
   - tests/test_utils
   - tests/worker
+  - tests/standalone_tests/lazy_imports.py
   commands:
+  - python3 standalone_tests/lazy_imports.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -60,15 +61,26 @@
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker

+- label: Python-only Installation Test
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
 - label: Basic Correctness Test # 30min
   #mirror_hardwares: [amd]
   fast_check: true
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
   - tests/basic_correctness/test_preemption
+  - tests/basic_correctness/test_cumem.py
   commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
@@ -94,58 +106,73 @@ steps:
 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  mirror_hardwares: [amd]
+  torch_nightly: true
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
   - tests/entrypoints/openai
   - tests/entrypoints/test_chat_utils
   - tests/entrypoints/offline_mode
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
   - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   fast_check: true
   source_file_dependencies:
   - vllm/distributed/
   - vllm/core/
   - tests/distributed
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
   commands:
   # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
   - python3 rlhf.py
   - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

 - label: Metrics, Tracing Test # 10min
   mirror_hardwares: [amd]
   num_gpus: 2
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/metrics
   - tests/tracing
   commands:
   - pytest -v -s metrics
   - "pip install \
     'opentelemetry-sdk>=1.26.0,<1.27.0' \
     'opentelemetry-api>=1.26.0,<1.27.0' \
     'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
     'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
   - pytest -v -s tracing

 ##### fast check tests  #####
 ##### 1 GPU test #####

 - label: Regression Test # 5min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/test_regression
@@ -160,6 +187,9 @@
   - vllm/
   - tests/engine
   - tests/tokenization
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately
@@ -171,29 +201,52 @@
   - vllm/
   - tests/v1
   commands:
-  - pytest -v -s v1
+  # split the test to avoid interference
+  - pytest -v -s v1/core
+  - pytest -v -s v1/engine
+  - pytest -v -s v1/entrypoints
+  - pytest -v -s v1/sample
+  - pytest -v -s v1/worker
+  - pytest -v -s v1/structured_output
+  - pytest -v -s v1/spec_decode
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s v1/test_stats.py
+  - pytest -v -s v1/test_utils.py
+  - pytest -v -s v1/test_oracle.py
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

-- label: Examples Test # 15min
+- label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/entrypoints
   - examples/
   commands:
-  - pip install awscli tensorizer # for llava example and tensorizer test
-  - python3 offline_inference.py
-  - python3 cpu_offload.py
-  - python3 offline_inference_chat.py
-  - python3 offline_inference_with_prefix.py
-  - python3 llm_engine_example.py
-  - python3 offline_inference_vision_language.py
-  - python3 offline_inference_vision_language_multi_image.py
-  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference_encoder_decoder.py
-  - python3 offline_profile.py --model facebook/opt-125m
+  - pip install tensorizer # for tensorizer test
+  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+  - python3 offline_inference/basic/chat.py
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 offline_inference/audio_language.py --seed 0
+  - python3 offline_inference/vision_language.py --seed 0
+  - python3 offline_inference/vision_language_embedding.py --seed 0
+  - python3 offline_inference/vision_language_multi_image.py --seed 0
+  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/encoder_decoder.py
+  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
@@ -205,6 +258,7 @@
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
   - tests/samplers
+  - tests/conftest.py
   commands:
   - pytest -v -s samplers
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -213,27 +267,41 @@
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/model_executor/layers
+  - vllm/model_executor/guided_decoding
   - tests/test_logits_processor
-  command: pytest -v -s test_logits_processor.py
+  - tests/model_executor/test_guided_processors
+  commands:
+  - pytest -v -s test_logits_processor.py
+  - pytest -v -s model_executor/test_guided_processors.py

-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 40min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test" # 9min
-  fast_check: true
+- label: PyTorch Compilation Unit Tests
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
+  - pytest -v -s compile/test_pass_manager.py
+  - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_sequence_parallelism.py
+
+- label: PyTorch Fullgraph Smoke Test # 9min
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
@@ -243,25 +311,56 @@
   commands:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py

-- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
+- label: Kernels Core Operation Test
   source_file_dependencies:
   - csrc/
-  - vllm/attention
-  - tests/kernels
+  - tests/kernels/core
   commands:
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+  - pytest -v -s kernels/core
+
+- label: Kernels Attention Test %N
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test
+  source_file_dependencies:
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  commands:
+  - pytest -v -s kernels/moe
+
+- label: Kernels Mamba Test
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  commands:
+  - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amd]
+  # mirror_hardwares: [amd]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
@@ -277,7 +376,14 @@
   source_file_dependencies:
   - benchmarks/
   commands:
-  - bash run-benchmarks.sh
+  - bash scripts/run-benchmarks.sh

+- label: Benchmarks CLI Test # 10min
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/

 - label: Quantization Test # 33min
   source_file_dependencies:
@@ -295,6 +401,14 @@
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1

+- label: OpenAI API correctness
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
+
 - label: Encoder Decoder tests # 5min
   source_file_dependencies:
   - vllm/
@@ -304,26 +418,30 @@

 - label: OpenAI-Compatible Tool Use # 20 min
   fast_check: false
-  mirror_hardwares: [ amd ]
+  #mirror_hardwares: [ amd ]
   source_file_dependencies:
   - vllm/
   - tests/tool_use
+  - tests/mistral_tool_use
   commands:
   - pytest -v -s tool_use
+  - pytest -v -s mistral_tool_use

 ##### models test #####

-- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
   source_file_dependencies:
   - vllm/
   - tests/models
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_initialization.py
+  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'

-- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -331,38 +449,46 @@
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
   - pytest -v -s models/embedding/vision_language -m core_model

-- label: Language Models Test (Extended) # 50min
-  nightly: true
+- label: Language Models Test (Extended) # 1h10min
+  optional: true
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'

-- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 40min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/audio_language
   - tests/models/encoder_decoder/vision_language
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/vision_language -m core_model
+  - pytest -v -s models/encoder_decoder/audio_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
+  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

-- label: Multi-Modal Models Test (Extended) # 1h15m
-  nightly: true
+- label: Multi-Modal Models Test (Extended) 1 # 48m
+  optional: true
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
@@ -370,16 +496,26 @@ steps:
   - tests/models/embedding/vision_language
   - tests/models/encoder_decoder/vision_language
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
   # HACK - run phi3v tests separately to sidestep this transformers bug
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
   optional: true
   commands:
   - echo 'Testing custom models...'
@@ -391,6 +527,7 @@
 ##### multi gpus test #####

 - label: Distributed Comm Ops Test # 7min
+  mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -412,11 +549,11 @@
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'

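A plausible motivation for dropping `-q` in these two commands: the pipeline commands run under `set -o pipefail`, and `grep -q` exits as soon as it sees a match, which can kill the still-writing producer with SIGPIPE and turn a passing check into a pipeline failure. A minimal sketch of the failure mode (illustrative and timing-dependent, not taken from the pipeline):

    set -o pipefail
    seq 1000000 | grep -q 1    # grep exits at the first match; seq may die with SIGPIPE
    echo "pipeline status: $?" # can be 141 (128+SIGPIPE) even though the match succeeded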
 - label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
@@ -429,19 +566,51 @@
   - vllm/model_executor/models/
   - tests/distributed/
   - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  - tests/v1/test_async_llm_dp.py
+  - vllm/v1/engine/
   commands:
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  # test sequence parallel
+  - pytest -v -s distributed/test_sequence_parallel.py
+  # this test fails consistently.
+  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
   - vllm/plugins/
   - tests/plugins/
   commands:
   # begin platform plugin tests, all the code in-between runs on dummy platform
   - pip install -e ./plugins/vllm_add_dummy_platform
   - pytest -v -s plugins_tests/test_platform_plugins.py
   - pip uninstall vllm_add_dummy_platform -y
   # end platform plugin tests
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_oot_registration.py # it needs a clean process

 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
@@ -458,7 +627,9 @@
   - vllm/engine
   - tests/multi_step
   commands:
-  - pytest -v -s multi_step/test_correctness_async_llm.py
+  # this test is quite flaky
+  # TODO: investigate and fix.
+  # - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

 - label: Pipeline Parallelism Test # 45min
@@ -474,18 +645,20 @@
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py

-- label: LoRA Long Context (Distributed) # 11min
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
+- label: LoRA TP Test (Distributed)
   num_gpus: 4
   soft_fail: true
   source_file_dependencies:
   - vllm/lora
-  - tests/lora/test_long_context
+  - tests/lora
   commands:
   # FIXIT: find out which code initialize cuda before running the test
   # before the fix, we need to use spawn to test it
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
+  # There is some Tensor Parallelism related processing logic in LoRA that
+  # requires multi-GPU testing for validation.
+  - pytest -v -s -x lora/test_chatglm3_tp.py
+  - pytest -v -s -x lora/test_llama_tp.py

 - label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
@@ -513,6 +686,7 @@

 - label: Distributed Tests (A100) # optional
   gpu: a100
+  optional: true
   num_gpus: 4
   source_file_dependencies:
   - vllm/
@@ -521,11 +695,12 @@
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
   - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
   gpu: a100
+  optional: true
   num_gpus: 4
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:

@@ -1,38 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-# Assume wheels are in artifacts/dist/*.whl
-wheel_files=(artifacts/dist/*.whl)
-
-# Check that exactly one wheel is found
-if [[ ${#wheel_files[@]} -ne 1 ]]; then
-  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
-  exit 1
-fi
-
-# Get the single wheel file
-wheel="${wheel_files[0]}"
-
-# Rename 'linux' to 'manylinux1' in the wheel filename
-new_wheel="${wheel/linux/manylinux1}"
-mv -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
-
-# Extract the version from the wheel
-version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-  new_version="1.0.0.dev"
-  new_wheel="${wheel/$version/$new_version}"
-  mv -- "$wheel" "$new_wheel"
-  wheel="$new_wheel"
-  version="$new_version"
-fi
-
-# Upload the wheel to S3
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
.github/CODEOWNERS (vendored, 50 lines)
@@ -2,29 +2,41 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-CMakeLists.txt @tlrmchlsmth @WoosukKwon
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/vllm_flash_attn @LucasWilkinson
+CMakeLists.txt @tlrmchlsmth

+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+/vllm/v1/structured_output @mgoin @russellb

 # Test ownership
-/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
-/tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
-/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
-/tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-neuralmagic
-/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/multi_step @alexm-neuralmagic @comaniac
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/model_executor/test_guided_processors.py @mgoin @russellb
+/tests/models @DarkLight1337 @ywang96
+/tests/multi_step @alexm-redhat @comaniac
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/quantization @mgoin @robertgshaw2-redhat
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
+/tests/v1/structured_output @mgoin @russellb
+/tests/weight_loading @mgoin @youkaichao
.github/FUNDING.yml (vendored, 2 lines)
@@ -1,2 +1,2 @@
 github: [vllm-project]
-open_collective: [vllm]
+open_collective: vllm
|
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
.github/ISSUE_TEMPLATE/300-usage.yml (vendored, 2 lines)
@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
@@ -30,15 +30,6 @@ body:
       </details>
     validations:
       required: true
-  - type: textarea
-    attributes:
-      label: Model Input Dumps
-      description: |
-        If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
-      placeholder: |
-        Upload the dumped input file.
-    validations:
-      required: false
   - type: textarea
     attributes:
       label: 🐛 Describe the bug
@@ -9,7 +9,7 @@ body:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
   - type: textarea
     attributes:
       label: The model to consider.
@@ -35,7 +35,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
.github/ISSUE_TEMPLATE/800-misc discussion.yml (vendored, deleted, 28 lines)
@@ -1,28 +0,0 @@
-name: 🎲 Misc/random discussions that do not fit into the above categories.
-description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
-title: "[Misc]: "
-labels: ["misc"]
-
-body:
-- type: markdown
-  attributes:
-    value: >
-      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-- type: textarea
-  attributes:
-    label: Anything you want to discuss about vllm.
-    description: >
-      Anything you want to discuss about vllm.
-  validations:
-    required: true
-- type: markdown
-  attributes:
-    value: >
-      Thanks for contributing 🎉!
-- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
.github/ISSUE_TEMPLATE/config.yml (4 lines changed, vendored)
@@ -1 +1,5 @@
blank_issues_enabled: false
contact_links:
  - name: Questions
    url: https://discuss.vllm.ai
    about: Ask questions and discuss with other vLLM community members
.github/PULL_REQUEST_TEMPLATE.md (72 lines changed, vendored)
@@ -2,73 +2,5 @@ FILL IN THE PR DESCRIPTION HERE

FIX #xxxx (*link existing issues this PR will resolve*)

**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**

---

<details>
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
<summary><b> PR Checklist (Click to Expand) </b></summary>

<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>

<h3>PR Title and Classification</h3>
<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
<ul>
<li><code>[Bugfix]</code> for bug fixes.</li>
<li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
<li><code>[Doc]</code> for documentation fixes and improvements.</li>
<li><code>[Model]</code> for adding a new model or improving an existing model. The model name should appear in the title.</li>
<li><code>[Frontend]</code> for changes to the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.)</li>
<li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
<li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
<li><code>[Hardware][Vendor]</code> for hardware-specific changes. The vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
<li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
</ul>
<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>

<h3>Code Quality</h3>

<p>The PR needs to meet the following code quality standards:</p>

<ul>
<li>We adhere to the <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
<li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
<li>The code needs to be well-documented to ensure future contributors can easily understand it.</li>
<li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
</ul>

<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a>.</li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See the above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
</ul>

<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag the PR with <code>rfc-required</code> and might not review it.</p>

<h3>What to Expect for the Reviews</h3>

<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:</p>

<ul>
<li>After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
<li>After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
<li>After the review, the reviewer will put an <code>action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
<li>Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.</li>
</ul>

<h3>Thank You</h3>

<p>Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!</p>

</details>

<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
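For kernel authors, the checklist's pointer to `torch.library.opcheck()` is worth unpacking. Below is a minimal sketch, assuming PyTorch 2.4+, of registering a custom op together with a fake (meta) implementation and validating both with opcheck; `mylib::scale` is a hypothetical op invented for illustration, not something from vLLM:

```python
import torch

# Hypothetical example op; vLLM's real kernels live in csrc/ and are bound in C++.
@torch.library.custom_op("mylib::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor

# Meta-function (fake impl) so dynamic shapes can be traced without running the kernel.
@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)

# opcheck verifies the schema, registration, and fake impl agree with each other.
torch.library.opcheck(scale, (torch.randn(4), 2.0))
```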
.github/dependabot.yml (7 lines changed, vendored)
@@ -15,18 +15,17 @@ updates:
    allow:
      - dependency-type: "all"
    ignore:
      - dependency-name: "*"
        update-types: ["version-update:semver-patch"]
      - dependency-name: "torch"
      - dependency-name: "torchvision"
      - dependency-name: "xformers"
      - dependency-name: "lm-format-enforcer"
      - dependency-name: "gguf"
      - dependency-name: "compressed-tensors"
      - dependency-name: "ray[adag]"
      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
      - dependency-name: "lm-eval"
    groups:
      patch-update:
        applies-to: version-updates
        update-types: ["patch"]
      minor-update:
        applies-to: version-updates
        update-types: ["minor"]
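Read together, the `ignore` block drops every semver-patch bump plus a pinned set of packages (with `ray[adag]` renamed to `ray[cgraph]` in this change). A rough Python sketch of that filtering decision, purely illustrative:

```python
# Packages Dependabot should never bump (mirrors the ignore list above).
PINNED = {
    "torch", "torchvision", "xformers", "lm-format-enforcer",
    "gguf", "compressed-tensors", "ray[cgraph]", "lm-eval",
}

def should_ignore(name: str, update_type: str) -> bool:
    """Return True if an update would be skipped under the config above."""
    return update_type == "version-update:semver-patch" or name in PINNED

assert should_ignore("numpy", "version-update:semver-patch")   # all patch bumps
assert should_ignore("torch", "version-update:semver-minor")   # pinned package
assert not should_ignore("numpy", "version-update:semver-minor")
```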
.github/mergify.yml (115 lines changed, vendored)
@@ -5,6 +5,7 @@ pull_request_rules:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
      - files~=^examples/
  actions:
    label:
      add:
@@ -18,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^Dockerfile
      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@@ -35,6 +36,118 @@ pull_request_rules:
      add:
        - frontend

- name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality

- name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/source/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
        - structured-output

- name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
      - files~=^vllm/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
  actions:
    label:
      add:
        - speculative-decoding

- name: label-v1
  description: Automatically apply v1 label
  conditions:
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
  actions:
    label:
      add:
        - v1

- name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu

- name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu

- name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/source/features/tool_calling.md
      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
      - files=docs/source/getting_started/examples/chat_with_tools.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - conflict
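These rules are plain regex matches over changed file paths; note how the `label-tpu` / `label-tpu-remove` pair keeps its two condition lists in sync so the label is dropped once no TPU-related file remains in the diff. A loose Python sketch of that add/remove behavior (illustrative only; Mergify evaluates the real conditions itself):

```python
import re

# Mirrors the `label-tpu` condition list above.
TPU_PATTERNS = [r"tpu\.py", r"_tpu", r"tpu_", r"/tpu/", r"pallas"]

def tpu_labels(changed_files: list[str], current: set[str]) -> set[str]:
    """Return the label set after applying the add/remove rule pair."""
    touches_tpu = any(
        re.search(p, f) for p in TPU_PATTERNS for f in changed_files
    )
    labels = set(current)
    if touches_tpu:
        labels.add("tpu")       # label-tpu
    else:
        labels.discard("tpu")   # label-tpu-remove
    return labels

print(tpu_labels(["vllm/v1/worker/tpu_model_runner.py"], set()))  # {'tpu'}
print(tpu_labels(["README.md"], {"tpu"}))                         # set()
```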
.github/scripts/cleanup_pr_body.sh (25 lines changed, vendored)
@@ -15,19 +15,36 @@
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"

# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"

# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"

# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"

# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import re

with open("${NEW}", "r") as file:
    content = file.read()

pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
content = re.sub(pattern, '', content)

with open("${NEW}", "w") as file:
    file.write(content)
EOF

# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${NEW}"; then
    echo "Updating PR body"
    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
    echo
    echo "Updated PR body:"
    echo
    cat "${NEW}"
else
    echo "No changes needed"
fi
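The heredoc above leans on a single DOTALL regex to strip the checklist `<details>` block from the PR body. A standalone sketch of the same pattern against a toy PR body (same regex as the script; the input text is hypothetical):

```python
import re

body = """My change description.

---

<details>
<summary><b> PR Checklist (Click to Expand) </b></summary>
...checklist contents...
</details>
"""

# Same pattern the script uses; the optional group also eats the "---" separator.
pattern = re.compile(
    r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\)'
    r'.*?</summary>.*?</details>',
    re.DOTALL,
)
# Removes the whole checklist block, leaving only the description.
print(re.sub(pattern, '', body))
```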
.github/workflows/actionlint.yml (40 lines changed, vendored)
@@ -1,40 +0,0 @@
name: Lint GitHub Actions workflows
on:
  push:
    branches:
      - "main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
      - '.github/workflows/matchers/actionlint.json'
  pull_request:
    branches:
      - "main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
      - '.github/workflows/matchers/actionlint.json'

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  actionlint:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: "Run actionlint"
        run: |
          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
          tools/actionlint.sh -color
.github/workflows/clang-format.yml (53 lines changed, vendored)
@@ -1,53 +0,0 @@
name: clang-format

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - '**/*.h'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.cuh'
      - '.github/workflows/clang-format.yml'
  pull_request:
    branches:
      - main
    paths:
      - '**/*.h'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.cuh'
      - '.github/workflows/clang-format.yml'

jobs:
  clang-format:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install clang-format==18.1.5
      - name: Running clang-format
        run: |
          EXCLUDES=(
            'csrc/moe/topk_softmax_kernels.cu'
            'csrc/quantization/gguf/ggml-common.h'
            'csrc/quantization/gguf/dequantize.cuh'
            'csrc/quantization/gguf/vecdotq.cuh'
            'csrc/quantization/gguf/mmq.cuh'
            'csrc/quantization/gguf/mmvq.cuh'
          )
          find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
            | xargs clang-format --dry-run --Werror
.github/workflows/cleanup_pr_body.yml (2 lines changed, vendored)
@@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'
.github/workflows/codespell.yml (45 lines changed, vendored)
@@ -1,45 +0,0 @@
name: codespell

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - "**/*.md"
      - "**/*.rst"
      - pyproject.toml
      - requirements-lint.txt
      - .github/workflows/codespell.yml
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - "**/*.md"
      - "**/*.rst"
      - pyproject.toml
      - requirements-lint.txt
      - .github/workflows/codespell.yml

jobs:
  codespell:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-lint.txt
      - name: Spelling check with codespell
        run: |
          codespell --toml pyproject.toml
.github/workflows/lint-and-deploy.yaml (82 lines changed, vendored, new file)
@@ -0,0 +1,82 @@
name: Lint and Deploy Charts

on: pull_request

jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4

      # Python is required because ct lint runs Yamale and yamllint, which require Python.
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'

      - name: Set up chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1

      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm

      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
            -e "MINIO_ACCESS_KEY=minioadmin" \
            -e "MINIO_SECRET_KEY=minioadmin" \
            -v /tmp/data:/data \
            -v /tmp/config:/root/.minio \
            minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm

      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
            --header "Content-Type: application/json" \
            --data '{
              "model": "opt-125m",
              "prompt": "San Francisco is a",
              "max_tokens": 7,
              "temperature": 0
            }'):$CODE"
          echo "$CODE"
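The final `curl test` step issues a completion request against the port-forwarded service. An equivalent sketch in Python, assuming the same port-forward from the previous step is active, using only the standard library:

```python
import json
import urllib.request

# Mirrors the workflow's curl smoke test against the port-forwarded service.
payload = {
    "model": "opt-125m",
    "prompt": "San Francisco is a",
    "max_tokens": 7,
    "temperature": 0,
}
req = urllib.request.Request(
    "http://localhost:8001/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])
```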
.github/workflows/matchers/ruff.json (17 lines changed, vendored)
@@ -1,17 +0,0 @@
{
    "problemMatcher": [
        {
            "owner": "ruff",
            "pattern": [
                {
                    "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
                    "file": 1,
                    "line": 2,
                    "column": 3,
                    "code": 4,
                    "message": 5
                }
            ]
        }
    ]
}
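For reference, this matcher's regexp simply splits a ruff diagnostic line into file/line/column/code/message groups. A quick sketch against a sample line (the diagnostic text is hypothetical):

```python
import re

# Same regexp as the deleted problem matcher above.
RUFF_LINE = re.compile(r"^(.+?):(\d+):(\d+): (\w+): (.+)$")

m = RUFF_LINE.match("vllm/engine/llm_engine.py:42:5: F401: 'os' imported but unused")
assert m is not None
file, line, col, code, message = m.groups()
print(file, line, col, code, message)
```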
.github/workflows/mypy.yaml (51 lines changed, vendored)
@@ -1,51 +0,0 @@
name: mypy

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - '**/*.py'
      - '.github/workflows/mypy.yaml'
      - 'tools/mypy.sh'
      - 'pyproject.toml'
  pull_request:
    branches:
      - main
    # This workflow is only relevant when one of the following files changes.
    # However, we have github configured to expect and require this workflow
    # to run and pass before github with auto-merge a pull request. Until github
    # allows more flexible auto-merge policy, we can just run this on every PR.
    # It doesn't take that long to run, anyway.
    #paths:
    #  - '**/*.py'
    #  - '.github/workflows/mypy.yaml'
    #  - 'tools/mypy.sh'
    #  - 'pyproject.toml'

jobs:
  mypy:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install mypy==1.11.1
          pip install types-setuptools
          pip install types-PyYAML
          pip install types-requests
          pip install types-setuptools
      - name: Mypy
        run: |
          echo "::add-matcher::.github/workflows/matchers/mypy.json"
          tools/mypy.sh 1 ${{ matrix.python-version }}
.github/workflows/pre-commit.yml (20 lines changed, vendored, new file)
@@ -0,0 +1,20 @@
name: pre-commit

on:
  pull_request:
  push:
    branches: [main]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: "3.12"
      - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
      - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        with:
          extra_args: --all-files --hook-stage manual
.github/workflows/publish.yml (105 lines changed, vendored)
@@ -39,67 +39,68 @@ jobs:
          const script = require('.github/workflows/scripts/create_release.js')
          await script(github, context, core)

  wheel:
    name: Build Wheel
    runs-on: ${{ matrix.os }}
    needs: release
  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
  #   needs: release

    strategy:
      fail-fast: false
      matrix:
        os: ['ubuntu-20.04']
        python-version: ['3.9', '3.10', '3.11', '3.12']
        pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
        cuda-version: ['11.8', '12.1']
  #   strategy:
  #     fail-fast: false
  #     matrix:
  #       os: ['ubuntu-20.04']
  #       python-version: ['3.9', '3.10', '3.11', '3.12']
  #       pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
  #       cuda-version: ['11.8', '12.1']

    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
        with:
          create-symlink: true
          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
  #     - name: Setup ccache
  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
  #       with:
  #         create-symlink: true
  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

      - name: Set up Linux Env
        if: ${{ runner.os == 'Linux' }}
        run: |
          bash -x .github/workflows/scripts/env.sh
  #     - name: Set up Linux Env
  #       if: ${{ runner.os == 'Linux' }}
  #       run: |
  #         bash -x .github/workflows/scripts/env.sh

      - name: Set up Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
  #     - name: Set up Python
  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  #       with:
  #         python-version: ${{ matrix.python-version }}

      - name: Install CUDA ${{ matrix.cuda-version }}
        run: |
          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
  #     - name: Install CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
        run: |
          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

      - name: Build wheel
        shell: bash
        env:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
  #     - name: Build wheel
  #       shell: bash
  #       env:
  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
  #       run: |
  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
  #         asset_name=${wheel_name//"linux"/"manylinux1"}
  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

      - name: Upload Release Asset
        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.release.outputs.upload_url }}
          asset_path: ./dist/${{ env.wheel_name }}
          asset_name: ${{ env.asset_name }}
          asset_content_type: application/*
  #     - name: Upload Release Asset
  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       with:
  #         upload_url: ${{ needs.release.outputs.upload_url }}
  #         asset_path: ./dist/${{ env.wheel_name }}
  #         asset_name: ${{ env.asset_name }}
  #         asset_content_type: application/*

  # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
  # - name: Publish package
.github/workflows/reminder_comment.yml (8 lines changed, vendored)
@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
on:
  pull_request_target:
    types: [opened]

jobs:
  pr_reminder:
    runs-on: ubuntu-latest
@@ -15,7 +14,12 @@ jobs:
            owner: context.repo.owner,
            repo: context.repo.repo,
            issue_number: context.issue.number,
            body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
            body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
              '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
              'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
              'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
              'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
              '🚀'
          })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/ruff.yml (52 lines changed, vendored)
@@ -1,52 +0,0 @@
name: ruff

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - pyproject.toml
      - requirements-lint.txt
      - .github/workflows/matchers/ruff.json
      - .github/workflows/ruff.yml
  pull_request:
    branches:
      - main
    # This workflow is only relevant when one of the following files changes.
    # However, we have github configured to expect and require this workflow
    # to run and pass before github with auto-merge a pull request. Until github
    # allows more flexible auto-merge policy, we can just run this on every PR.
    # It doesn't take that long to run, anyway.
    #paths:
    #  - "**/*.py"
    #  - pyproject.toml
    #  - requirements-lint.txt
    #  - .github/workflows/matchers/ruff.json
    #  - .github/workflows/ruff.yml

jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-lint.txt
      - name: Analysing the code with ruff
        run: |
          echo "::add-matcher::.github/workflows/matchers/ruff.json"
          ruff check --output-format github .
      - name: Run isort
        run: |
          isort . --check-only
.github/workflows/scripts/build.sh (2 lines changed, vendored)
@@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
.github/workflows/scripts/create_release.js (2 lines changed, vendored)
@@ -1,4 +1,4 @@
// Uses Github's API to create the release and wait for result.
// Uses GitHub's API to create the release and wait for result.
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.

module.exports = async (github, context, core) => {
.github/workflows/shellcheck.yml (37 lines changed, vendored)
@@ -1,37 +0,0 @@
name: Lint shell scripts
on:
  push:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
  pull_request:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  shellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: "Check shell scripts"
        run: |
          tools/shellcheck.sh
.github/workflows/stale.yml (2 lines changed, vendored)
@@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
.github/workflows/yapf.yml (38 lines changed, vendored)
@@ -1,38 +0,0 @@
name: yapf

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml

jobs:
  yapf:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install yapf==0.32.0
          pip install toml==0.10.2
      - name: Running yapf
        run: |
          yapf --diff --recursive .
.gitignore (10 lines changed, vendored)
@@ -2,7 +2,7 @@
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/
vllm/vllm_flash_attn/*

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -79,8 +79,7 @@ instance/

# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
docs/source/getting_started/examples/

# PyBuilder
.pybuilder/
@@ -198,8 +197,11 @@ _build/
hip_compat.h

# Benchmark dataset
benchmarks/*.json
benchmarks/**/*.json

# Linting
actionlint
shellcheck*/

# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
.pre-commit-config.yaml (137 lines changed, new file)
@@ -0,0 +1,137 @@
default_install_hook_types:
  - pre-commit
  - commit-msg
default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/google/yapf
  rev: v0.43.0
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
- repo: https://github.com/codespell-project/codespell
  rev: v2.4.0
  hooks:
  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort
  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  hooks:
  - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v19.1.7
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
  rev: v0.9.27
  hooks:
  - id: pymarkdown
    args: [fix]
- repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
  rev: 0.6.2
  hooks:
  - id: pip-compile
    args: [requirements/test.in, -o, requirements/test.txt]
    files: ^requirements/test\.(in|txt)$
- repo: local
  hooks:
  - id: mypy-local
    name: Run mypy for local Python installation
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
    entry: tools/mypy.sh 1 "3.9"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: tools/mypy.sh 1 "3.10"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
    entry: tools/mypy.sh 1 "3.11"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
    entry: tools/mypy.sh 1 "3.12"
    language: python
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
    entry: tools/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
    name: Sign-off Commit
    entry: bash
    args:
      - -c
      - |
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
        fi
    language: system
    verbose: true
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
  - id: check-filenames
    name: Check for spaces in all filenames
    entry: bash
    args:
      - -c
      - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
    language: system
    always_run: true
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
    files: ^docker/Dockerfile$
    pass_filenames: false
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
    language: system
    verbose: true
    pass_filenames: false
  # Insert new entries above the `suggestion` entry
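The `signoff-commit` hook above appends a DCO trailer in bash by grepping the commit message file. A rough Python rendering of the same check-then-append logic, as a sketch only (the caller is assumed to supply the git identity):

```python
from pathlib import Path

def ensure_signoff(msg_file: str, name: str, email: str) -> None:
    """Append a Signed-off-by trailer unless one for this identity exists."""
    path = Path(msg_file)
    trailer = f"Signed-off-by: {name} <{email}>"
    text = path.read_text()
    if not any(line == trailer for line in text.splitlines()):
        path.write_text(text + f"\n{trailer}\n")

# e.g. ensure_signoff(".git/COMMIT_EDITMSG", "Jane Doe", "jane@example.com")
```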
.readthedocs.yaml
@@ -18,4 +18,4 @@ formats: []
# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements-docs.txt
    - requirements: requirements/docs.txt
407
CMakeLists.txt
407
CMakeLists.txt
@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
@ -34,10 +31,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||
|
||||
#
|
||||
# Supported/expected torch versions for CUDA/ROCm.
|
||||
@ -47,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
||||
#
|
||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -177,10 +174,54 @@ include(FetchContent)
|
||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||
|
||||
#
|
||||
# Set rocm version dev int.
|
||||
#
|
||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
|
||||
|
||||
#
|
||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||
#
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Define other extension targets
|
||||
#
|
||||
|
||||
#
|
||||
# cumem_allocator extension
|
||||
#
|
||||
|
||||
set(VLLM_CUMEM_EXT_SRC
|
||||
"csrc/cumem_allocator.cpp")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_CUMEM_EXT_SRC}"
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Enabling cumem allocator extension.")
|
||||
# link against cuda driver library
|
||||
list(APPEND CUMEM_LIBS CUDA::cuda_driver)
|
||||
define_gpu_extension_target(
|
||||
cumem_allocator
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_CUMEM_EXT_SRC}
|
||||
LIBRARIES ${CUMEM_LIBS}
|
||||
USE_SABI 3.8
|
||||
WITH_SOABI)
|
||||
endif()
|
||||
|
||||
#
|
||||
# _C extension
|
||||
#
|
||||
@ -189,27 +230,46 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
"csrc/attention/merge_attn_states.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||
|
||||
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
|
||||
# Please keep this in sync with FetchContent_Declare line below.
|
||||
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
endif()
|
||||
|
||||
if(VLLM_CUTLASS_SRC_DIR)
|
||||
if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
|
||||
get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
|
||||
endif()
|
||||
message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
|
||||
FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
GIT_TAG v3.5.1
|
||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||
GIT_TAG v3.9.0
|
||||
GIT_PROGRESS TRUE
|
||||
|
||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||
@ -217,6 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
endif()
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
@ -224,10 +285,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp"
|
||||
"csrc/attention/mla/cutlass_mla_entry.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_EXT_SRC}"
|
||||
@ -236,7 +300,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||
# are not supported by Machete yet.
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||
if (MARLIN_ARCHS)
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||
@ -256,39 +320,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
" in CUDA target architectures")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Only build AllSpark kernels if we are building for at least some compatible archs.
|
||||
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
|
||||
if (ALLSPARK_ARCHS)
|
||||
set(ALLSPARK_SRCS
|
||||
"csrc/quantization/gptq_allspark/allspark_repack.cu"
|
||||
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${ALLSPARK_SRCS}"
|
||||
CUDA_ARCHS "${ALLSPARK_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
|
||||
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building AllSpark kernels as no compatible archs found"
|
||||
" in CUDA target architectures")
|
||||
endif()
|
||||
|
||||
|
||||
+  set(SCALED_MM_3X_ARCHS)
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+  # CUDA 12.0 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
                      "Hopper.")
     else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                      "in CUDA target architectures")
     endif()
-
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-    # build any 3x kernels
-    set(SCALED_MM_3X_ARCHS)
   endif()

+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Blackwell.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
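Every kernel family in this file is gated the same way: intersect the requested CUDA_ARCHS with the arch list the kernel supports, require a minimum nvcc version, and only then wire the sources and the -DENABLE_* flag into the extension. The intersection uses vLLM's own cuda_archs_loose_intersection helper from cmake/utils.cmake; "loose" is assumed here to mean that accelerated "a" variants such as 9.0a are also matched against a plain 9.0 entry in CUDA_ARCHS. A minimal sketch of the pattern, with MY_KERNEL and csrc/my_kernel.cu as purely illustrative placeholders:

  cuda_archs_loose_intersection(MY_KERNEL_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MY_KERNEL_ARCHS)
    # per-file gencode flags, then add the source to the extension build
    set_gencode_flags_for_srcs(
      SRCS "csrc/my_kernel.cu"
      CUDA_ARCHS "${MY_KERNEL_ARCHS}")
    list(APPEND VLLM_EXT_SRC "csrc/my_kernel.cu")
  else()
    message(STATUS "Skipping my_kernel: needs nvcc > 12.0 and a 9.0a/10.0a target")
  endif()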

   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -309,6 +421,94 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
   endif()

+  #
+  # 2:4 Sparse Kernels
+
+  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
+  # require CUDA 12.2 or later (and only work on Hopper).
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
+                     "if you intend on running FP8 sparse quantized models on Hopper.")
+    else()
+      message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()

+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+    set(SRCS
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()

+  # CUTLASS MLA Archs and flags
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+    set(SRCS
+      "csrc/attention/mla/cutlass_mla_kernels.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${MLA_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
+    # Add MLA-specific include directories only to MLA source files
+    set_source_files_properties(${SRCS}
+      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
+    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
+  else()
+    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
+    # clear MLA_ARCHS
+    set(MLA_ARCHS)
+  endif()
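One detail worth noting above: the CUTLASS example headers are attached with set_source_files_properties(... PROPERTIES INCLUDE_DIRECTORIES ...), so they are visible only to the MLA translation units rather than to every file in the target. The same idiom in isolation (file and variable names illustrative):

  # scope extra include paths to a single source file
  set_source_files_properties(csrc/only_this_file.cu
    PROPERTIES INCLUDE_DIRECTORIES "${EXTRA_INC_DIR_A};${EXTRA_INC_DIR_B}")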

+  # CUTLASS MoE kernels
+
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
+             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
+    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                     "if you intend on running FP8 quantized MoE models on Hopper.")
+    else()
+      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()

   #
   # Machete kernels

@@ -391,6 +591,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
@@ -409,28 +610,70 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/moe_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+endif()
+
 set_gencode_flags_for_srcs(
   SRCS "${VLLM_MOE_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
-  if (MARLIN_MOE_ARCHS)
-    set(MARLIN_MOE_SRC
-        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
-        "csrc/moe/marlin_moe_ops.cu")
-    set(VLLM_MOE_WNA16_SRC
-        "csrc/moe/moe_wna16.cu")
-
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
-      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    set_gencode_flags_for_srcs(
-      SRCS "${VLLM_MOE_WNA16_SRC}"
-      CUDA_ARCHS "${CUDA_ARCHS}")
-
-    list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_ARCHS)
+
+    #
+    # For the Marlin MOE kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MOE_MARLIN_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
+    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+
+    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE moe_marlin_generation_result
+        OUTPUT_VARIABLE moe_marlin_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+      )
+
+      if (NOT moe_marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin MOE generation failed."
+                            " Result: \"${moe_marlin_generation_result}\""
+                            "\nCheck the log for details: "
+                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
+      else()
+        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
+        message(STATUS "Marlin MOE generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
+    endif()
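The guard above is a generic "fingerprint the generator" pattern: the generation script's MD5 is kept in the CMake cache, so the slow execute_process step reruns only when the script itself changes. Stripped to its essentials (the GEN_SCRIPT/GEN_HASH names are illustrative):

  file(MD5 ${GEN_SCRIPT} GEN_HASH)
  if (NOT DEFINED CACHE{GEN_SCRIPT_HASH}
      OR NOT $CACHE{GEN_SCRIPT_HASH} STREQUAL ${GEN_HASH})
    execute_process(COMMAND ${Python_EXECUTABLE} ${GEN_SCRIPT}
                    RESULT_VARIABLE gen_result)
    if (gen_result EQUAL 0)
      # remember the fingerprint so the next configure can skip this step
      set(GEN_SCRIPT_HASH ${GEN_HASH} CACHE STRING "generator fingerprint" FORCE)
    endif()
  endif()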
+
+    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MOE_WNAA16_MARLIN_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+
-    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

     message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
   else()
     message(STATUS "Not building Marlin MOE kernels as no compatible archs found"

@@ -455,6 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")

 define_gpu_extension_target(
@@ -468,79 +712,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   WITH_SOABI)
 endif()

-# vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
-  return()
-endif()
+# For CUDA we also build and ship some external projects.
+if (VLLM_GPU_LANG STREQUAL "CUDA")
+  include(cmake/external_projects/flashmla.cmake)
+  include(cmake/external_projects/vllm_flash_attn.cmake)
+endif ()

-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis)
-# we need to manually set VLLM_GPU_ARCHES here.
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  foreach(_ARCH ${CUDA_ARCHS})
-    string(REPLACE "." "" _ARCH "${_ARCH}")
-    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
-  endforeach()
-endif()
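Concretely, the loop above rewrote dotted arch strings into CMake's -real architecture syntax: a CUDA_ARCHS of "7.5;9.0a" yields a VLLM_GPU_ARCHES of "75-real;90a-real", i.e. SASS for exactly those archs with no PTX fallback. For example:

  string(REPLACE "." "" _ARCH "9.0a")           # -> "90a"
  list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")  # -> "90a-real"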

-#
-# Build vLLM flash attention from source
-#
-# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
-# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
-# They should be identical, but if they aren't, this is a massive footgun.
-#
-# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
-# If no component is specified, vllm-flash-attn is still installed.
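With those rules in place, installing only the nested component would look like cmake --install <build-dir> --component vllm_flash_attn_c (the build-directory path is assumed); a full install with no --component flag still picks these rules up, which is what the last comment above promises.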

-# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
-# This is to enable local development of vllm-flash-attn within vLLM.
-# It can be set as an environment variable or passed as a cmake argument.
-# The environment variable takes precedence.
-if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
-  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
-endif()
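For local development that override would be driven like so (the path is illustrative): either set VLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention in the environment when invoking the build, or pass -DVLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention on the cmake command line; as the comment notes, the environment variable wins when both are set.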

-if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
-else()
-  FetchContent_Declare(
-    vllm-flash-attn
-    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
-    GIT_PROGRESS TRUE
-    # Don't share the vllm-flash-attn build between build types
-    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
-  )
-endif()

-# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
-set(VLLM_PARENT_BUILD ON)
-
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
-
-# Fetch the vllm-flash-attn library
-FetchContent_MakeAvailable(vllm-flash-attn)
-message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
-
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)

-# Copy over the vllm-flash-attn python files
-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT vllm_flash_attn_c
-  FILES_MATCHING PATTERN "*.py"
-)
-
-# Nothing after vllm-flash-attn, see comment about macros above

@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.

 All community leaders are obligated to respect the privacy and security of the
@@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
-